Example #1
        print("Loading pretrained agent")
        # Policy should not be changed
        del hyperparams['policy']

        model = ALGOS[args.algo].load(args.trained_agent,
                                      env=env,
                                      tensorboard_log=tensorboard_log,
                                      verbose=args.verbose,
                                      **hyperparams)

        exp_folder = args.trained_agent[:-4]  # strip the 4-char extension (".pkl"/".zip") to get the experiment folder
        if normalize:
            print("Loading saved running average")
            stats_path = os.path.join(exp_folder, env_id)
            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
            else:
                # Legacy:
                env.load_running_average(exp_folder)

    elif args.optimize_hyperparameters:

        if args.verbose > 0:
            print("Optimizing hyperparameters")

        def create_model(*_args, **kwargs):
            """
            Helper to create a model with different hyperparameters
            """
            return ALGOS[args.algo](env=create_env(n_envs, no_log=True),
                                    tensorboard_log=tensorboard_log,
Example #2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import PPO2
from LearningRocket import LearningRocket
import matplotlib.pyplot as plt
from stable_baselines.common.vec_env import VecNormalize
from stable_baselines.common.callbacks import EvalCallback
import numpy as np

# multiprocess environment
env = make_vec_env(LearningRocket, n_envs=16)
eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)
#env = VecNormalize(env)
#eval_env = VecNormalize(eval_env)

env = VecNormalize.load("doof_env", env)
eval_env = VecNormalize.load("doof_env", eval_env)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path='Agent007',
                             log_path='./logs/',
                             eval_freq=10000,
                             deterministic=True,
                             render=False,
                             n_eval_episodes=1)

#model = PPO2(MlpPolicy, env, n_steps=1000, nminibatches=32, lam=0.98, gamma=0.999, learning_rate=1e-4,
#                                  noptepochs=4,ent_coef=0.01,verbose=1, tensorboard_log="./rocket_tensorboard/",
#                                  policy_kwargs = dict(layers=[400, 300]))
"""model = PPO2(MlpPolicy, env,verbose=1, tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs = dict(layers=[400, 300]))"""
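
# A possible continuation (not part of the original snippet), assuming
# stable-baselines 2.10+ where callback objects and VecNormalize.save() are
# available; the file name "Agent007_final" is illustrative only.
model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./rocket_tensorboard/")
model.learn(total_timesteps=1_000_000, callback=eval_callback)
model.save("Agent007_final")
# Persist the (updated) normalization statistics next to the agent so that
# evaluation scripts can reload them with VecNormalize.load()
env.save("doof_env")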
Example #3
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
      _seed: generated by Sacred.
      env_name: Gym environment identifier.
      eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      num_vec: Number of environments to run simultaneously.
      parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
          use `DummyVecEnv`.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      render: If True, renders interactively to the screen.
      render_fps: Target frames per second for interactive rendering.
      log_dir: The directory to log intermediate output to. (As of 2019-07-19
          this is just episode-by-episode reward from bench.Monitor.)
      policy_type: A unique identifier for the saved policy,
          defined in POLICY_CLASSES.
      policy_path: A path to the serialized policy.
      reward_type: If specified, overrides the environment reward with
          a reward of this type.
      reward_path: If reward_type is specified, the path to a serialized reward
          of `reward_type` to override the environment reward with.

    Returns:
      Return value of `imitation.util.rollout.rollout_stats()`.
    """

    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)
    venv = VecNormalize.load(
        os.path.join(policy_path, "vec_normalize.pkl"), venv)
    # Freeze the loaded statistics and keep rewards unnormalized during evaluation
    venv.training = False
    venv.norm_reward = False

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
Example #4
def normalize_env(
    env,
    orig_log_dir,
    sb_version,
    vectorize=True,
    continue_learning=False,
    evaluate=False,
    evaluate_during_learning=False,
    normalize_kwargs=None,
):
    if vectorize:
        env = DummyVecEnv([lambda: env])

    logger.debug("Normalize: {}".format(normalize_kwargs))
    if evaluate:
        # FIXME: during continue-learning, training should be True so that the running averages of obs and
        #  rewards are updated with new samples; however, doing so makes the algo perform very poorly even
        #  when the env is unchanged
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        if not evaluate_during_learning or continue_learning:
            if not os.path.exists(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")):
                env_name = get_env_name(env=env.unwrapped,
                                        sb_version=sb_version)
                index_last_separator = orig_log_dir.rindex("/")
                new_orig_log_dir = os.path.join(
                    orig_log_dir[0:index_last_separator], "logs_" + env_name)
                logger.debug(
                    "{} does not exist. Trying to search it in the original model directory {}"
                    .format(os.path.join(orig_log_dir, "vecnormalize.pkl"),
                            new_orig_log_dir))
                assert os.path.exists(new_orig_log_dir), \
                    "{} does not exist".format(new_orig_log_dir)
                assert os.path.exists(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")), (
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl") +
                        " does not exist")
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"),
                        env)
                else:
                    env = VecNormalize.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"),
                        env)
            else:
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

        # Deactivate training and reward normalization
        env.training = False
        env.norm_reward = False

    elif continue_learning:
        # FIXME: unclear why, but training has to be disabled during continue-learning, otherwise performance
        #  does not match a model trained from scratch, even without changing the params of the environment.
        #  In rl-baselines-zoo this is not done during continue-learning:
        #  https://github.com/araffin/rl-baselines-zoo/blob/master/train.py#L365
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        assert os.path.exists(os.path.join(
            orig_log_dir, "vecnormalize.pkl")), (
                os.path.join(orig_log_dir, "vecnormalize.pkl") +
                " does not exist")
        logger.debug("[continue_learning] Loading {}".format(
            os.path.join(orig_log_dir, "vecnormalize.pkl")))
        if sb_version == "sb3":
            env = VecNormalize3.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
        else:
            env = VecNormalize.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

    else:
        if sb_version == "sb3":
            env = VecNormalize3(env, **normalize_kwargs)
        else:
            env = VecNormalize(env, **normalize_kwargs)

    return env
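
# Hypothetical usage sketch (environment id, log dir and kwargs are illustrative,
# not taken from this repo):
#
#   train_env = normalize_env(gym.make("CartPole-v1"),
#                             orig_log_dir="logs/CartPole-v1",
#                             sb_version="sb3",
#                             normalize_kwargs={"norm_obs": True, "norm_reward": True})
#   eval_env = normalize_env(gym.make("CartPole-v1"),
#                            orig_log_dir="logs/CartPole-v1",
#                            sb_version="sb3",
#                            evaluate=True,
#                            normalize_kwargs={"norm_obs": True, "norm_reward": False})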
Example #5
        h_shape = (10, 10, 1024)  # Shape of the hidden state of the lstm network
        history_shape = 20  # Number of past actions to be tracked
        env = InattEnv(args, h_shape=h_shape, history_shape=history_shape, is_test=True, dynamic_gamma=dynamic_lambda_0)
        if dynamic_lambda_0:
            print("INFO: Gamma has been set to " + str(args.lambda_0))
            env.lambda_0 = args.lambda_0

        env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
        env = VecNormalize(env, norm_obs=True, norm_reward=False)

        # Create RL policy
        policy = PPO2.load(os.path.join(args.rl_path, "best_agent.zip"), env=env, verbose=1)

        if args.normalize_env:
            if os.path.exists(os.path.join(args.rl_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(args.rl_path, 'vecnormalize.pkl'), env)
                env.training = False
                env.norm_reward = False
            else:
                raise Exception("Normalization parameters not found")

        print("INFO: Loaded model " + os.path.join(args.rl_path, "best_agent.zip"))
    else:
        raise Exception("Policy type not recognized")

    # Active for plotting images
    plot_image = True

    if plot_image:
        def on_trackbar(val):
            if args.policy == 'baseline':
Example #6
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None,
                    env_kwargs=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    if env_kwargs is None:
        env_kwargs = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])
    # Pybullet envs does not follow gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy: wrap the env first, then restore the saved running average
                env = VecNormalize(env,
                                   training=False,
                                   **hyperparams['normalize_kwargs'])
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
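
# Hypothetical usage sketch (ids, paths and hyperparams below are illustrative):
#
#   env = create_test_env("HalfCheetahBulletEnv-v0",
#                         n_envs=1,
#                         stats_path="logs/ppo2/HalfCheetahBulletEnv-v0_1/HalfCheetahBulletEnv-v0",
#                         seed=0,
#                         log_dir=None,
#                         should_render=False,
#                         hyperparams={"normalize": True,
#                                      "normalize_kwargs": {"norm_reward": False}})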
Example #7
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
            for i in range(n_envs)
        ])
    # Pybullet envs does not follow gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(
            class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env,
                              os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([
                make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)
            ])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv(
            [make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy: wrap the env first, then restore the saved running average
                env = VecNormalize(env,
                                   training=False,
                                   **hyperparams['normalize_kwargs'])
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
Example #8
    def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
        self.agent_name = agent_name

        #self.env = LearningRocket(visualize=False)
        #self.env = NormalizeActionWrapper(self.env)

        #self.eval_env = LearningRocket(visualize=True)
        #self.eval_env = NormalizeActionWrapper(self.eval_env)

        #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
        self.env = make_vec_env(
            LearningRocket, n_envs=16
        )  #[lambda: LearningRocket(visualize=False) for i in range(16)]))
        #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
        self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                     n_envs=1)
        #self.eval_env = VecNormalize(self.eval_env)
        self.eval_callback = EvalCallback(self.eval_env,
                                          best_model_save_path='Agent007',
                                          log_path='./logs/',
                                          eval_freq=10000,
                                          deterministic=True,
                                          render=False,
                                          n_eval_episodes=1)
        kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
        #check_env(self.env, warn=True)
        """
        if algorithm == "SAC":
            if load is True:
                self.model = SAC.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                #self.model.ent_coef=0.2
            else:
                self.model = SAC('MlpPolicy', self.env, verbose=1, tensorboard_log="./rocket_tensorboard/",ent_coef=5)
            print("Trainer Set for SAC")
        """
        if algorithm == "TD3":
            n_actions = self.env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            if load is True:
                self.model = TD3.load(agent_name,
                                      env=self.env,
                                      tensorboard_log="./rocket_tensorboard/")
                #file = open('replay_buffer', 'rb')
                #self.model.replay_buffer = pickle.load(file)
                #file.close()
            else:
                self.model = TD3(MlpPolicy,
                                 self.env,
                                 action_noise=action_noise,
                                 batch_size=768,
                                 gamma=0.95,
                                 learning_rate=1e-4,
                                 learning_starts=20000,
                                 verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 policy_kwargs=dict(layers=[400, 300]))
            print("Trainer Set for TD3")
        elif algorithm == "PPO2":
            if load is True:
                self.model = PPO2.load(agent_name,
                                       env=self.env,
                                       tensorboard_log="./rocket_tensorboard/")
                self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                                  self.eval_env)
                #self.eval_env.clip_obs = 500
                #self.env = VecNormalize(self.env)
                self.env = VecNormalize.load(self.agent_name + "vEnv",
                                             self.env)
                #self.env.clip_obs = 500
                #self.env.norm_obs = False
                #self.eval_env.norm_obs = False
            else:
                self.model = PPO2(PPOMlpPolicy,
                                  self.env,
                                  n_steps=1024,
                                  nminibatches=32,
                                  lam=0.98,
                                  gamma=0.999,
                                  noptepochs=4,
                                  ent_coef=0.01,
                                  verbose=1,
                                  tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs=dict(layers=[400, 300]))
                self.eval_env = VecNormalize(self.eval_env)
                self.env = VecNormalize(self.env)
                #self.eval_env.clip_obs = 500
                #self.env.clip_obs = 500
                #self.env.norm_obs=False
                #self.eval_env.norm_obs=False

                print("Trainer set for PPO2. I am speed.")
Example #9
def main():
    try:
        difficulty = int(sys.argv[1])
        initial_pose_json = sys.argv[2]
        goal_pose_json = sys.argv[3]
        output_file = sys.argv[4]
    except IndexError:
        print("Incorrect number of arguments.")
        print("Usage:\n"
              "\tevaluate_policy.py <difficulty_level> <initial_pose>"
              " <goal_pose> <output_file>")
        sys.exit(1)

    # the poses are passed as JSON strings, so they need to be converted first
    initial_pose = move_cube.Pose.from_json(initial_pose_json)
    goal_pose = move_cube.Pose.from_json(goal_pose_json)

    # create a FixedInitializer with the given values
    initializer = FixedInitializer(difficulty, initial_pose, goal_pose)

    # TODO: Replace with your environment if you used a custom one.
    env = CubeEnv(frameskip=5,
                  visualization=False,
                  initializer=initializer,
                  action_type=ActionType.POSITION,
                  observation_type=ObservationType.WITHOUT_GOALS,
                  testing=True)
    env = FrameStackWrapper(TimeFeatureWrapper(FlatObservationWrapper(env)), 4)
    #env = TimeFeatureWrapper(FlatObservationWrapper(env))

    norm_env = VecNormalize.load("models/normalized_env_frame_stacked_model",
                                 DummyVecEnv([lambda: env]))

    if difficulty == 1:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_01__1000000_steps.zip"
        )
    elif difficulty == 2:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_06__1000000_steps.zip"
        )
    elif difficulty == 3:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_14__1000000_steps.zip"
        )
    elif difficulty == 4:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_18__1000000_steps.zip"
        )

    # Execute one episode.  Make sure that the number of simulation steps
    # matches with the episode length of the task.  When using the default Gym
    # environment, this is the case when looping until is_done == True.  Make
    # sure to adjust this in case your custom environment behaves differently!
    is_done = False
    observation = env.reset()
    accumulated_reward = 0
    while not is_done:
        action, _ = policy.predict(np.expand_dims(
            norm_env.normalize_obs(observation), axis=0),
                                   deterministic=True)
        observation, reward, is_done, info = env.step(action[0])
        accumulated_reward += reward

    print("Accumulated reward: {}".format(accumulated_reward))

    # store the log for evaluation
    env.platform.store_action_log(output_file)
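
# Hypothetical invocation matching the usage string above (pose JSON strings
# and output file are placeholders):
#   python3 evaluate_policy.py 1 '<initial_pose_json>' '<goal_pose_json>' action_log.json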
Example #10
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import PPO2
from TestHover.LearningRocketHover import LearningRocket
import matplotlib.pyplot as plt
from stable_baselines.common.vec_env import VecNormalize
from stable_baselines.common.callbacks import EvalCallback
import numpy as np

# multiprocess environment
env = make_vec_env(LearningRocket, n_envs=16)
eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)
#env = VecNormalize(env)
#eval_env = VecNormalize(eval_env)

env = VecNormalize.load("TestHover_env", env)
eval_env = VecNormalize.load("TestHover_env", eval_env)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path='Agent007',
                             log_path='./logs/',
                             eval_freq=10000,
                             deterministic=True,
                             render=False,
                             n_eval_episodes=1)

model = PPO2(MlpPolicy,
             env,
             n_steps=1000,
             nminibatches=32,
             lam=0.98,
Example #11
    return _init


if __name__ == '__main__':
    log_dir = 'models/hover/empty_world_small/finalVec'
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env_id = 'CrazyflieObstacleEval-v0'

    # Load the agent
    model = PPO2.load(log_dir + '/ppo2_final')

    # Load the saved statistics
    env = DummyVecEnv([
        lambda: gym.make(env_id, n_obstacles=1, avoidance_method='Heuristic')
    ])
    env = VecNormalize.load(stats_path, env)
    #  do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False

    eval_episodes = 50

    total_goals_reached = 0
    total_collisions = 0
    total_flips = 0
    total_steps_exceeded = 0
    total_potential_collisions = 0
    total_collisions_avoided = 0
    total_timesteps = 0
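
    # A hedged sketch of the evaluation loop these counters suggest; the info
    # keys needed to update the specific counters are not visible in this
    # snippet, so only the generic rollout structure is shown.
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            done = dones[0]
            total_timesteps += 1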
Example #12
    def learn(self):
        # Use deterministic actions for evaluation
        eval_path = self.model_dir + "/best_model"
        # TODO save checkpoints with vecnormalize callback pkl file
        save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=eval_path)
        if self.norm:
            # Don't normalize the reward for test env
            self.test_env = VecNormalize(self.test_env, norm_obs=True, norm_reward=False,
                                        clip_obs=10.)
        eval_callback = EvalCallback(self.test_env, best_model_save_path=eval_path,
                                    log_path=eval_path+'/logs', eval_freq=50000,
                                    n_eval_episodes=10, callback_on_new_best=save_vec_normalize,
                                    deterministic=True, render=False)
        checkpoint_callback = CheckpointCallback(save_freq=25000, save_path=self.model_dir+'/logs/',
                                         name_prefix='rl_model')
        time_callback = TrainingTimeCallback()
        tensorboard_file = None if self.config[self.algo]['tensorboard_logs'] is None else "tensorboard_logs/"+self.model_dir
        if self.algo == 'SAC':
            if not self.env.envs[0].is_simplified() and (self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
                policy_kwargs = {
                    "layers": self.config[self.algo]['layers'],
                    "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
                policy = sacCnn
            elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
                policy_kwargs = {}
                policy = sacCnn
            else:
                policy_kwargs = {"layers": self.config[self.algo]['layers'], "layer_norm": False}
                policy = sacMlp
            if self.load_dir:
                top_folder_idx = self.load_dir.rfind('/')
                top_folder_str = self.load_dir[0:top_folder_idx]
                if self.norm:
                    self.env = VecNormalize(self.env, training=True, norm_obs=False, norm_reward=False,
                                            clip_obs=10.)
                    self.env = VecNormalize.load(os.path.join(top_folder_str, 'vecnormalize.pkl'), self.env)
                model = sb.SAC(policy,
                            self.env,
                            policy_kwargs=policy_kwargs,
                            verbose=1,
                            gamma=self.config['discount_factor'],
                            buffer_size=self.config[self.algo]['buffer_size'],
                            batch_size=self.config[self.algo]['batch_size'],
                            learning_rate=self.config[self.algo]['step_size'],
                            tensorboard_log=tensorboard_file)
                model_load = sb.SAC.load(self.load_dir, self.env)
                params = model_load.get_parameters()
                model.load_parameters(params, exact_match=False)
            else:
                if self.norm:
                    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True,
                                            clip_obs=10.)
                model = sb.SAC(policy,
                            self.env,
                            policy_kwargs=policy_kwargs,
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            buffer_size=self.config[self.algo]['buffer_size'],
                            batch_size=self.config[self.algo]['batch_size'],
                            learning_rate=self.config[self.algo]['step_size'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == 'TRPO':
            model = sb.TRPO(MlpPolicy, 
                            self.env, 
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            timesteps_per_batch=self.config[self.algo]['max_iters'],
                            vf_stepsize=self.config[self.algo]['step_size'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == 'PPO':
            if not self.env.envs[0].is_simplified() and (self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
                policy_kwargs = {
                    "layers": self.config[self.algo]['layers'],
                    "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
                policy = CnnPolicy
            elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
                policy_kwargs = {}
                policy = CnnPolicy
            else:
                policy_kwargs = {"layers": self.config[self.algo]['layers'], "layer_norm": False}
                policy = MlpPolicy
            model = sb.PPO2(policy,
                            self.env,
                            policy_kwargs=policy_kwargs,
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            learning_rate=self.config[self.algo]['learning_rate'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == 'DQN':
            if self.load_dir:
                model = self.load_params()
            else:
                model = sb.DQN(DQNMlpPolicy, 
                            self.env, 
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            batch_size=self.config[self.algo]['batch_size'],
                            prioritized_replay=self.config[self.algo]['prioritized_replay'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == "DDPG":
            param_noise = AdaptiveParamNoiseSpec()
            model = sb.DDPG(ddpgMlp,
                            self.env,
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            param_noise=param_noise,
                            tensorboard_log=tensorboard_file)
        try:
            model.learn(total_timesteps=int(self.config[self.algo]['total_timesteps']), 
                        callback=[TensorboardCallback(self.env, tensorboard_file, self.algo, self.log_freq, self.model_dir), 
                                   eval_callback])
        except KeyboardInterrupt:
            pass

        self.save(model, self.model_dir)
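
    # The save() helper called above is not shown in this snippet; a plausible
    # sketch (file names are assumptions) would persist both the model and,
    # when normalization is enabled, the VecNormalize statistics for reloading:
    #
    #   def save(self, model, model_dir):
    #       model.save(os.path.join(model_dir, "final_model"))
    #       if self.norm:
    #           self.env.save(os.path.join(model_dir, "vecnormalize.pkl"))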
Example #13
def train(method="SAC"):
    def get_multi_process_env(num_of_envs,
                              subprocess=True,
                              amplitude_scaling=False,
                              frameskip=5,
                              with_goals=False,
                              action_type=ActionType.POSITION,
                              difficulty=1,
                              initializer="random",
                              testing=False):

        if initializer == "random":
            initializer = RandomInitializer(difficulty=difficulty)
        elif initializer == "completely_random":
            initializer = CompletelyRandomInitializer()

        def _make_env(rank):
            def _init():
                obs_type = ObservationType.WITH_GOALS if with_goals else ObservationType.WITHOUT_GOALS
                out_env = CubeEnv(frameskip=frameskip,
                                  visualization=False,
                                  initializer=initializer,
                                  action_type=action_type,
                                  observation_type=obs_type,
                                  testing=testing)
                out_env.seed(seed=54321)
                out_env.action_space.seed(seed=54321)
                if not with_goals:
                    out_env = FlatObservationWrapper(
                        out_env, amplitude_scaling=amplitude_scaling)
                    out_env = TimeFeatureWrapper(out_env,
                                                 max_steps=math.ceil(
                                                     3750 / frameskip))
                else:
                    out_env = GoalObservationWrapper(
                        out_env, amplitude_scaling=amplitude_scaling)
                return out_env

            return _init

        if subprocess:
            return SubprocVecEnv(
                [_make_env(rank=i) for i in range(num_of_envs)])
        else:
            return DummyVecEnv([_make_env(rank=i) for i in range(num_of_envs)])

    date_time_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S_")
    print(method, date_time_str)
    set_global_seeds(0)

    if method == "HER":
        env = get_multi_process_env(1,
                                    subprocess=False,
                                    amplitude_scaling=True,
                                    frameskip=5,
                                    with_goals=True)
        env.set_attr("reward_range", 1000)
        policy_kwargs = dict(layers=[128, 128], act_fun=tf.tanh)

        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.2) *
                                                    np.ones(n_actions))

        model = HER("MlpPolicy",
                    env,
                    SAC,
                    policy_kwargs=policy_kwargs,
                    n_sampled_goal=4,
                    goal_selection_strategy='future',
                    verbose=1,
                    tensorboard_log="tblogs",
                    batch_size=512,
                    buffer_size=100000,
                    gamma=0.98,
                    learning_starts=10000,
                    random_exploration=0.15)
        model.learn(int(2e6),
                    log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(1e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
    if method == "SAC":
        env = VecNormalize(VecFrameStack(
            get_multi_process_env(1,
                                  subprocess=False,
                                  amplitude_scaling=False,
                                  frameskip=5,
                                  action_type=ActionType.POSITION,
                                  difficulty=1,
                                  initializer="completely_random"), 4),
                           norm_reward=False,
                           clip_reward=1500,
                           gamma=0.99)
        policy_kwargs = dict(layers=[256, 256])

        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.2) *
                                                    np.ones(n_actions))
        model = SAC("LnMlpPolicy",
                    env,
                    policy_kwargs=policy_kwargs,
                    buffer_size=1000000,
                    batch_size=256,
                    gamma=0.99,
                    learning_rate=LinearSchedule(int(2e6),
                                                 5e-5,
                                                 initial_p=3e-4).value,
                    train_freq=64,
                    gradient_steps=4,
                    tau=0.005,
                    learning_starts=10000,
                    tensorboard_log="tblogs",
                    verbose=1,
                    use_emph_exp=True,
                    action_noise=action_noise)
        model.learn(int(2e6),
                    log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(5e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
        env.save("normalized_env_" + date_time_str)
    if method == "CONTINUE_SAC":
        difficulty = 4
        env = VecNormalize.load(
            "models/normalized_env_frame_stacked_model",
            VecFrameStack(
                get_multi_process_env(1,
                                      subprocess=False,
                                      amplitude_scaling=True,
                                      frameskip=5,
                                      action_type=ActionType.POSITION,
                                      difficulty=difficulty,
                                      initializer="random",
                                      testing=True), 4))

        model = SAC.load(
            "models/checkpoint_saves/SAC_09_18_2020_19_07_42__1000000_steps.zip",
            env=env,
            tensorboard_log="tblogs",
        )
        model.learn(int(1e6),
                    log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(5e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
        env.save("normalized_env_difficulty_" + str(difficulty))
        model.save(
            os.path.join('models', "model_difficulty_" + str(difficulty)))
    if method == "save_vec_env":
        env = VecNormalize(
            get_multi_process_env(1,
                                  subprocess=False,
                                  amplitude_scaling=True,
                                  frameskip=5,
                                  action_type=ActionType.POSITION,
                                  difficulty=1,
                                  initializer="completely_random"))

        model = SAC.load(
            "models/checkpoint_saves/SAC_09_18_2020_14_27_30__2000000_steps.zip",
            env=env)
        model.learn(int(1e5), log_interval=1)
        env.save("normalized_env_without_framestack")
        return
    else:
        return

    print("save model: ", os.path.join('models', method + '_' + date_time_str))
Example #14
def main():
    # SIM_ENV_NAME = 'InvertedPendulum-v2'
    # REAL_ENV_NAME = 'InvertedPendulumModified-v2_old'

    # expt_label = input('Enter a label for the experiment : ')

    parser = argparse.ArgumentParser(
        description='Reinforced Grounded Action Transformation')
    parser.add_argument(
        '--target_policy_algo',
        default="TRPO",
        type=str,
        help="name in str of the agent policy training algorithm")
    parser.add_argument(
        '--action_tf_policy_algo',
        default="TRPO",
        type=str,
        help="name in str of the Action Transformer policy training algorithm")
    parser.add_argument(
        '--load_policy_path',
        default='data/models/TRPO_initial_policy_steps_Hopper-v2_1000000_.pkl',
        help="relative path of initial policy trained in sim")
    parser.add_argument('--alpha',
                        default=1.0,
                        type=float,
                        help="Deprecated feature. Ignore")
    parser.add_argument('--beta',
                        default=1.0,
                        type=float,
                        help="Deprecated feature. Ignore")
    parser.add_argument(
        '--n_trainsteps_target_policy',
        default=1000000,
        type=int,
        help=
        "Number of time steps to train the agent policy in the grounded environment"
    )
    parser.add_argument(
        '--n_trainsteps_action_tf_policy',
        default=1000000,
        type=int,
        help=
        "Timesteps to train the Action Transformer policy in the ATPEnvironment"
    )
    parser.add_argument(
        '--num_cores',
        default=10,
        type=int,
        help="Number of threads to use while collecting real world experience")
    parser.add_argument('--sim_env',
                        default='InvertedPendulum-v2',
                        help="Name of the simulator environment (Unmodified)")
    parser.add_argument('--real_env',
                        default='InvertedPendulumModified-v2',
                        help="Name of the Real World environment (Modified)")
    parser.add_argument(
        '--n_frames',
        default=1,
        type=int,
        help="Number of previous frames observed by discriminator")
    parser.add_argument(
        '--expt_number',
        default=1,
        type=int,
        help="Expt. number to keep track of multiple experiments")
    parser.add_argument(
        '--n_grounding_steps',
        default=1,
        type=int,
        help="Number of grounding steps. (Outerloop of algorithm ) ")
    parser.add_argument('--n_iters_atp',
                        default=20,
                        type=int,
                        help="Number of GAN iterations")
    parser.add_argument('--discriminator_epochs',
                        default=5,
                        type=int,
                        help="Discriminator epochs per GAN iteration")
    parser.add_argument('--generator_epochs',
                        default=50,
                        type=int,
                        help="ATP epochs per GAN iteration")
    parser.add_argument('--real_trajs',
                        default=100,
                        type=int,
                        help="Set max amount of real TRAJECTORIES used")
    parser.add_argument('--sim_trajs',
                        default=100,
                        type=int,
                        help="Set max amount of sim TRAJECTORIES used")
    parser.add_argument('--real_trans',
                        default=5000,
                        type=int,
                        help="amount of real world transitions used")
    parser.add_argument('--gsim_trans',
                        default=5000,
                        type=int,
                        help="amount of simulator transitions used")
    parser.add_argument('--debug', action='store_true', help="DEPRECATED")
    parser.add_argument(
        '--eval',
        action='store_true',
        help=
        "set to true to evaluate the agent policy in the real environment, after training in grounded environment"
    )
    parser.add_argument('--use_cuda',
                        action='store_true',
                        help="DEPRECATED. Not using CUDA")
    parser.add_argument('--instance_noise',
                        action='store_true',
                        help="DEPRECATED. Not using instance noise")
    parser.add_argument(
        '--ent_coeff',
        default=0.00005,
        type=float,
        help=
        "entropy coefficient for the PPO algorithm, used to train the action transformer policy"
    )
    parser.add_argument(
        '--max_kl',
        default=0.000005,
        type=float,
        help="Set this only if using TRPO for the action transformer policy")
    parser.add_argument(
        '--clip_range',
        default=0.1,
        type=float,
        help="PPO objective clipping factor -> Action transformer policy")
    parser.add_argument('--use_condor', action='store_true', help="UNUSABLE")
    parser.add_argument(
        '--plot',
        action='store_true',
        help=
        "visualize the action transformer policy - works well only for simple environments"
    )
    parser.add_argument('--tensorboard',
                        action='store_true',
                        help="visualize training in tensorboard")
    parser.add_argument('--save_atp',
                        action='store_true',
                        help="Saves the action transformer policy")
    parser.add_argument('--save_target_policy',
                        action='store_true',
                        help="saves the agent policy")
    parser.add_argument('--debug_discriminator',
                        action='store_true',
                        help="UNUSED")
    parser.add_argument('--use_eval_callback',
                        action='store_true',
                        help="UNUSED")
    parser.add_argument(
        '--loss_function',
        default="GAIL",
        type=str,
        help="choose from the list: ['GAIL', 'WGAN', 'AIRL', 'FAIRL']")
    parser.add_argument('--reset_disc_only',
                        action='store_true',
                        help="UNUSED")
    parser.add_argument('--namespace',
                        default="wed_night",
                        type=str,
                        help="namespace for the experiments")
    parser.add_argument('--dont_reset', action='store_true', help="UNUSED")
    parser.add_argument('--reset_target_policy',
                        action='store_true',
                        help="UNUSED")
    parser.add_argument('--randomize_target_policy',
                        action='store_true',
                        help="UNUSED")
    parser.add_argument(
        '--compute_grad_penalty',
        action='store_true',
        help=
        "set this to true to compute the GP term while training the discriminator"
    )
    parser.add_argument(
        '--single_batch_test',
        action='store_true',
        help="performs a single update of the generator and discriminator.")
    parser.add_argument('--folder_namespace',
                        default="None",
                        type=str,
                        help="UNUSED")
    parser.add_argument(
        '--disc_lr',
        default=3e-3,
        type=float,
        help="learning rate for the AdamW optimizer to update the discriminator"
    )
    parser.add_argument(
        '--atp_lr',
        default=3e-4,
        type=float,
        help="learning rate for the Adam optimizer to update the agent policy")
    parser.add_argument(
        '--nminibatches',
        default=4,
        type=int,
        help=
        "Number of minibatches used by the PPO algorithm to update the action transformer policy"
    )
    parser.add_argument(
        '--noptepochs',
        default=4,
        type=int,
        help=
        "Number of optimization epochs performed per minibatch by the PPO algorithm to update the action transformer policy"
    )
    parser.add_argument(
        '--deterministic',
        default=0,
        type=int,
        help=
        "set to 0 to use the deterministic action transformer policy in the grounded environment"
    )
    parser.add_argument('--single_batch_size',
                        default=0,
                        type=int,
                        help="batch size for the GARAT update")

    args = parser.parse_args()

    # set the seeds here for experiments
    random.seed(args.expt_number)
    np.random.seed(args.expt_number)
    torch.manual_seed(args.expt_number)

    # if args.wgan: args.loss_function = 'WGAN'

    # make dummy gym environment
    dummy_env = gym.make(args.real_env)

    if args.dont_reset is True and args.reset_disc_only is True:
        raise ValueError(
            'Cannot have both args dont_reset and reset_disc_only. Choose one.'
        )

    expt_type = 'sim2sim' if args.sim_env == args.real_env else 'sim2real'
    expt_label = args.namespace + args.loss_function + '_' + expt_type + '_' + args.target_policy_algo + '_' + str(
        args.n_trainsteps_target_policy) + '_' + str(
            args.real_trans) + '_' + str(args.n_iters_atp) + '_' + str(
                args.expt_number)

    # create the experiment folder
    if args.use_condor:
        if args.folder_namespace == "None":
            expt_path = '/u/' + args.real_env + '/' + expt_label
        else:
            expt_path = '/u/' + args.folder_namespace + '/' + expt_label
    else:
        expt_path = 'data/models/garat/' + expt_label
    expt_already_running = False

    gatworld = ReinforcedGAT(
        load_policy=args.load_policy_path,
        num_cores=args.num_cores,
        sim_env_name=args.sim_env,
        real_env_name=args.real_env,
        expt_label=expt_label,
        frames=args.n_frames,
        algo=args.target_policy_algo,
        atp_algo=args.action_tf_policy_algo,
        debug=args.debug,
        real_trajs=args.real_trajs,
        sim_trajs=args.sim_trajs,
        use_cuda=args.use_cuda,
        real_trans=args.real_trans,
        gsim_trans=args.gsim_trans,
        expt_path=expt_path,
        tensorboard=args.tensorboard,
        atp_loss_function=args.loss_function,
        single_batch_size=None
        if args.single_batch_size == 0 else args.single_batch_size,
    )

    # checkpointing logic ~~ necessary when deploying script on Condor cluster
    if os.path.exists(expt_path):
        print('~~ Resuming from checkpoint ~~')

        # remove the best_model.zip file if it exists
        if os.path.exists(expt_path + '/best_model.zip'):
            os.remove(expt_path + '/best_model.zip')

        expt_already_running = True
        grounding_step = len(glob.glob(expt_path + '/*.pkl'))
        print('found ', grounding_step, ' target policies in disk')
        if grounding_step == args.n_grounding_steps:  # training has ended
            raise ValueError('Rerunning same experiment again ! Exiting')
        else:
            if grounding_step > 0:
                print('reloading weights of the target policy')
                gatworld.load_model(expt_path + '/target_policy_' +
                                    str(grounding_step - 1) + '.pkl')
    else:
        print('First time running experiment')
        os.makedirs(expt_path)
        grounding_step = 0

        with open(expt_path + '/commandline_args.txt', 'w') as f:
            f.write('\n'.join(sys.argv[1:]))

    start_grounding_step = grounding_step

    if args.reset_disc_only or args.dont_reset:
        cprint('~~ INITIALIZING DISCRIMINATOR AND ATP POLICY ~~', 'yellow')
        gatworld._init_rgat_models(
            algo=args.action_tf_policy_algo,
            ent_coeff=args.ent_coeff,
            max_kl=args.max_kl,
            clip_range=args.clip_range,
            atp_loss_function=args.loss_function,
            disc_lr=args.disc_lr,
            atp_lr=args.atp_lr,
            nminibatches=args.nminibatches,
            noptepochs=args.noptepochs,
        )

    for _ in range(args.n_grounding_steps - start_grounding_step):
        grounding_step += 1

        gatworld.collect_experience_from_real_env()

        cprint('~~ RESETTING DISCRIMINATOR AND ATP POLICY ~~', 'yellow')
        gatworld._init_rgat_models(
            algo=args.action_tf_policy_algo,
            ent_coeff=args.ent_coeff,
            max_kl=args.max_kl,
            clip_range=args.clip_range,
            atp_loss_function=args.loss_function,
            disc_lr=args.disc_lr,
            atp_lr=args.atp_lr,
            nminibatches=args.nminibatches,
            noptepochs=args.noptepochs,
        )

        # ground the environment
        for ii in range(args.n_iters_atp):
            print('################### GROUNDING INNER ITERATION : ', ii,
                  ' ###################')
            for _ in range(args.discriminator_epochs):
                gatworld.train_discriminator(
                    iter_step=ii,
                    grounding_step=grounding_step,
                    num_epochs=args.noptepochs *
                    5 if ii <= 10 else args.noptepochs,  # warmup
                    inject_instance_noise=args.instance_noise,
                    compute_grad_penalty=args.compute_grad_penalty,
                    nminibatches=args.nminibatches,
                    single_batch_test=args.single_batch_test,
                    debug_discriminator=args.debug_discriminator,
                )

            gatworld.train_action_transformer_policy(
                beta=args.beta,
                num_epochs=args.generator_epochs,
                loss_function=args.loss_function,
                single_batch_test=args.single_batch_test,
            )

            # test grounded environment
            if args.plot and dummy_env.action_space.shape[0] < 5:
                # action transformer plot
                gatworld.test_grounded_environment(
                    alpha=args.alpha,
                    grounding_step=str(grounding_step) + '_' + str(ii),
                )
            else:
                print('Environment has action space > 5. Skipping AT plotting')

            if args.save_atp:
                # save the action transformer policy for further analysis
                gatworld.save_atp(grounding_step=str(grounding_step) + '_' +
                                  str(ii))
                # gatworld.save_grounded_env(grounding_step=str(grounding_step) + '_' + str(ii))

        # if args.randomize_target_policy:
        #     gatworld._randomize_target_policy(algo=args.target_policy_algo)

        gatworld.train_target_policy_in_grounded_env(
            grounding_step=grounding_step,
            alpha=args.alpha,
            time_steps=args.n_trainsteps_target_policy,
            use_eval_callback=args.use_eval_callback,
            save_model=args.save_target_policy,
            use_deterministic=True if args.deterministic == 1 else False,
        )

        if args.eval:
            cprint('Evaluating target policy in environment .. ', 'red',
                   'on_blue')
            test_env = gym.make(args.real_env)
            if 'mujoco_norm' in args.load_policy_path:
                test_env = MujocoNormalized(test_env)
            elif 'normalized' in args.load_policy_path:
                test_env = DummyVecEnv([lambda: test_env])
                test_env = VecNormalize.load('data/models/env_stats/' +
                                             args.sim_env + '.pkl',
                                             venv=test_env)
            # evaluate on the real world.
            try:
                val = evaluate_policy_on_env(test_env,
                                             gatworld.target_policy,
                                             render=False,
                                             iters=20,
                                             deterministic=True)

                with open(expt_path + "/output.txt", "a") as txt_file:
                    print(val, file=txt_file)

                val = evaluate_policy_on_env(test_env,
                                             gatworld.target_policy,
                                             render=False,
                                             iters=20,
                                             deterministic=False)

                with open(expt_path + "/stochastic_output.txt",
                          "a") as txt_file:
                    print(val, file=txt_file)
            except Exception as e:
                cprint(e, 'red')

    # expt done, now get the green and red lines
    if args.eval:
        # green line
        cprint('**~~vv^^ GETTING GREEN AND RED LINES ^^vv~~**', 'red',
               'on_green')
        test_env = gym.make(args.real_env)
        if 'mujoco_norm' in args.load_policy_path:
            test_env = MujocoNormalized(test_env)
        elif 'normalized' in args.load_policy_path:
            test_env = DummyVecEnv([lambda: test_env])
            test_env = VecNormalize.load('data/models/env_stats/' +
                                         args.sim_env + '.pkl',
                                         venv=test_env)

        sim_policy = 'data/models/' + args.target_policy_algo + '_initial_policy_steps_' + args.sim_env + '_1000000_.pkl'
        real_policy = 'data/models/' + args.target_policy_algo + '_initial_policy_steps_' + args.real_env + '_1000000_.pkl'

        if 'HalfCheetah' in args.load_policy_path or 'Reacher' in args.load_policy_path:
            sim_policy = sim_policy.replace('1000000_.pkl', '2000000_.pkl')
            real_policy = real_policy.replace('1000000_.pkl', '2000000_.pkl')

        # if 'Walker2d' in args.load_policy_path:
        #     sim_policy = sim_policy.replace('1000000_.pkl', '2000000_mujoco_norm_.pkl')
        #     real_policy = real_policy.replace('1000000_.pkl', '2000000_mujoco_norm_.pkl')

        if 'mujoco_norm' in args.load_policy_path:
            sim_policy = sim_policy.replace('1000000_.pkl',
                                            '2000000_mujoco_norm_.pkl')
            real_policy = real_policy.replace('1000000_.pkl',
                                              '2000000_mujoco_norm_.pkl')

        elif 'normalized' in args.load_policy_path:
            sim_policy = sim_policy.replace('1000000_.pkl',
                                            '1000000_normalized_.pkl')
            real_policy = real_policy.replace('1000000_.pkl',
                                              '1000000_normalized_.pkl')

        if args.target_policy_algo == 'PPO2':
            algo = PPO2
        elif args.target_policy_algo == 'TRPO':
            algo = TRPO

        val = evaluate_policy_on_env(test_env,
                                     algo.load(sim_policy),
                                     render=False,
                                     iters=10,
                                     deterministic=True)
        with open(expt_path + "/green_red.txt", "a") as txt_file:
            print(val, file=txt_file)

        # red line
        del algo  # remove the old algo and reload it.
        if args.target_policy_algo == 'PPO2':
            algo = PPO2
        elif args.target_policy_algo == 'TRPO':
            algo = TRPO

        val = evaluate_policy_on_env(test_env,
                                     algo.load(real_policy),
                                     render=False,
                                     iters=10,
                                     deterministic=True)
        with open(expt_path + "/green_red.txt", "a") as txt_file:
            print(val, file=txt_file)

    os._exit(0)
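
# Hypothetical invocation (script name and argument values are illustrative;
# all flags shown are defined in the argparse setup above):
#   python3 garat_main.py --sim_env InvertedPendulum-v2 \
#       --real_env InvertedPendulumModified-v2 \
#       --n_grounding_steps 1 --n_iters_atp 20 --eval --tensorboard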