Example #1
def train(hours):
    conn = Connection()
    env = Monitor(SpireEnv(conn), "./tmp/")
    env.reset()
    logdir = "./tboard_log"
    try:
        model = MODEL_CLASS.load(MODEL_NAME, env=env, tensorboard_log=logdir)
    except FileNotFoundError:
        model = MODEL_CLASS(MlpPolicy, env, tensorboard_log=logdir, **KWARGS)
    start = time.time()

    steps_per_hour = 7000
    steps = steps_per_hour * hours

    callback = TensorboardCallback(env)
    model.learn(total_timesteps=steps,
                reset_num_timesteps=False,
                callback=callback)
    model.save(MODEL_NAME)

    elapsed = time.time() - start
    print(f"{steps} steps processed")
    print(f"{timedelta(seconds=elapsed)} time elapsed")
    print(f"{env.total_floors} floors climbed")
    print(f"{env.total_games} games played")
    if env.total_games > 0:
        print("{:.2f} floors per game".format(env.total_floors /
                                              env.total_games))
Example #2
def test_monitor(tmp_path):
    """
    Test the monitor wrapper
    """
    env = gym.make("CartPole-v1")
    env.seed(0)
    monitor_file = os.path.join(
        str(tmp_path),
        "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env = Monitor(env, monitor_file)
    monitor_env.reset()
    total_steps = 1000
    ep_rewards = []
    ep_lengths = []
    ep_len, ep_reward = 0, 0
    for _ in range(total_steps):
        _, reward, done, _ = monitor_env.step(
            monitor_env.action_space.sample())
        ep_len += 1
        ep_reward += reward
        if done:
            ep_rewards.append(ep_reward)
            ep_lengths.append(ep_len)
            monitor_env.reset()
            ep_len, ep_reward = 0, 0

    monitor_env.close()
    assert monitor_env.get_total_steps() == total_steps
    assert sum(ep_lengths) == sum(monitor_env.get_episode_lengths())
    assert sum(monitor_env.get_episode_rewards()) == sum(ep_rewards)
    _ = monitor_env.get_episode_times()

    with open(monitor_file, "rt") as file_handler:
        first_line = file_handler.readline()
        assert first_line.startswith("#")
        metadata = json.loads(first_line[1:])
        assert metadata["env_id"] == "CartPole-v1"
        assert set(metadata.keys()) == {"env_id", "t_start"}, \
            "Incorrect keys in monitor metadata"

        last_logline = pandas.read_csv(file_handler, index_col=None)
        assert set(last_logline.keys()) == {"l", "t", "r"}, \
            "Incorrect keys in monitor logline"
    os.remove(monitor_file)
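The assertions above also document the on-disk format produced by Monitor: a first line starting with "#" holding JSON metadata (env_id, t_start), followed by plain CSV rows with columns r (episode reward), l (episode length) and t (wall-clock time). A minimal sketch of reading such a file outside of a test, assuming a hypothetical path run0.monitor.csv:

import json
import pandas

monitor_file = "run0.monitor.csv"  # hypothetical file written by Monitor(env, "run0")

with open(monitor_file, "rt") as file_handler:
    # First line: "#" followed by JSON metadata (env_id, t_start)
    metadata = json.loads(file_handler.readline()[1:])
    # Remaining lines: one CSV row per finished episode (columns r, l, t)
    episodes = pandas.read_csv(file_handler, index_col=None)

print(metadata["env_id"], len(episodes), episodes["r"].mean())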
Example #3
def train_ppo(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_ppo_{itr}")
    obs = env.reset()
    model = PPO(
        "CnnPolicy",
        env,
        verbose=1,
        learning_rate=1e-5,
        tensorboard_log=f"./ppo_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"ppo_flappy_{itr}")
Example #4
def test_monitor_load_results(tmp_path):
    """
    test load_results on log files produced by the monitor wrapper
    """
    tmp_path = str(tmp_path)
    env1 = gym.make("CartPole-v1")
    env1.seed(0)
    monitor_file1 = os.path.join(
        tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env1 = Monitor(env1, monitor_file1)

    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 1
    assert monitor_file1 in monitor_files

    monitor_env1.reset()
    episode_count1 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample())
        if done:
            episode_count1 += 1
            monitor_env1.reset()

    results_size1 = len(load_results(tmp_path).index)
    assert results_size1 == episode_count1

    env2 = gym.make("CartPole-v1")
    env2.seed(0)
    monitor_file2 = os.path.join(
        tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env2 = Monitor(env2, monitor_file2)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 2
    assert monitor_file1 in monitor_files
    assert monitor_file2 in monitor_files

    monitor_env2.reset()
    episode_count2 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample())
        if done:
            episode_count2 += 1
            monitor_env2.reset()

    results_size2 = len(load_results(tmp_path).index)

    assert results_size2 == (results_size1 + episode_count2)

    os.remove(monitor_file1)
    os.remove(monitor_file2)
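load_results returns a pandas DataFrame with one row per finished episode, which is exactly what helpers such as ts2xy (used again in Example #8 below) consume. A minimal sketch of turning a folder of monitor files into a learning curve, assuming the stable_baselines3 import path and a hypothetical log folder ./logs/:

import matplotlib.pyplot as plt
from stable_baselines3.common.results_plotter import load_results, ts2xy

log_dir = "./logs/"  # hypothetical folder containing *.monitor.csv files

# x: cumulative timesteps at the end of each episode, y: episode rewards
x, y = ts2xy(load_results(log_dir), "timesteps")
plt.plot(x, y)
plt.xlabel("timesteps")
plt.ylabel("episode reward")
plt.savefig("learning_curve.png")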
Example #5
def train_dqn(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_dqn_{itr}")
    obs = env.reset()
    model = DQN(
        "CnnPolicy",
        env,
        verbose=1,
        optimize_memory_usage=True,
        buffer_size=500000,
        learning_rate=1e-5,
        tensorboard_log=f"./dqn_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"dqn_flappy_{itr}")
Example #6
def test(model, test_images):
    test_env = Monitor(
        PuzzleEnv(images=test_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))

    solutions = []
    rews = []
    steps = []
    sample = len(test_images)
    errors = 0

    for _ in range(sample):
        i = 0
        done = False
        obs = test_env.reset()
        frames = [obs]

        while not done:
            i += 1
            action, _states = model.predict(obs)
            obs, rewards, done, info = test_env.step(action)
            frames.append(obs)
            rews.append(rewards)

            if i == 10000:
                errors += 1
                break

        solutions.append(frames)
        done = False
        print(i, sum(rews), rews)
        rews = []
        steps.append(i)

    print('Average steps taken:  ', sum(steps) / sample)
    print('Median of steps taken: ', statistics.median(steps))
    print('Number of errors: ', errors)
    plt.hist(steps, bins=9)
    plt.savefig('fig.png')
Example #7
    def learn(self, initial_models):
        mesa_algo = TD3(
            "MlpPolicy", self.env, verbose=1, learning_starts=1
        )  # Note: Unecessarily initializes parameters (could speed up a bit by fixing)'

        mesa_algo.set_parameters(to_torch(initial_models), exact_match=False)
        LOG_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/logs/"
        MODEL_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/models/"

        callback_list = []
        callback_list.append(TensorboardCallback())
        callback_list.append(
            StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1))
        """callback_list.append(EvalCallback(self.env, best_model_save_path=MODEL_DIR, log_path=LOG_DIR,
                                    deterministic=True,
                                    eval_freq=5,
                                    n_eval_episodes=1))"""
        mesa_algo.learn(total_timesteps=1000,  # rospy.get_param("/hyperparameters/total_timesteps")
                        callback=callback_list)

        print("finished training! Testing mesa network...")
        test_buffer = ReplayBuffer(100,
                                   TaskEnv.observation_space,
                                   TaskEnv.action_space,
                                   device="cuda")

        test_env = Monitor(self.env)
        done = False
        ob = test_env.reset()
        while not done:
            action, state = mesa_algo.predict(ob)
            next_ob, reward, done, info = test_env.step(action)
            test_buffer.add(ob, next_ob, action, reward, done, [info])
            ob = next_ob

        meta_buffer = {"test": test_buffer, "train": mesa_algo.replay_buffer}

        optimized_mesa_parameters = mesa_algo.get_parameters()
        tf_mesa_models = from_torch(optimized_mesa_parameters)

        return meta_buffer, tf_mesa_models
Example #8
def objective(trial):
    # gym environment & variables

    env = gym.make(env_id)
    # Parallel environments
    # env = make_vec_env(gym.make(env_id), n_envs=4)
    os.makedirs(logs_base_dir, exist_ok=True)
    env = Monitor(env, logs_base_dir)

    global episodes
    global mean_reward
    episodes = 0
    mean_reward = 0

    batch_size = trial.suggest_categorical(
        "batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical(
        "n_steps", [256, 512, 1024, 2048, 4096])
    gamma = trial.suggest_categorical(
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 2e-4, 6e-4)
    lr_schedule = "constant"

    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical(
        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical(
        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "large"])
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    sde_sample_freq = trial.suggest_categorical(
        "sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    activation_fn = trial.suggest_categorical(
        'activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[128, 128], vf=[128, 128])],
        "large": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU,
                     "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]


    #Create the policy_kwargs
    #Create the model_kwargs 
    #Create the callback

    #Store the policy_kwargs into log_tensorboard
    #Store the model_kwargs into log_tensorboard
    
    model = PPO(
        MlpPolicy,
        env,
        n_steps=n_steps,
        batch_size=batch_size,
        gamma=gamma,
        learning_rate=learning_rate,
        ent_coef=ent_coef,
        clip_range=clip_range,
        n_epochs=n_epochs,
        gae_lambda=gae_lambda,
        max_grad_norm=max_grad_norm,
        vf_coef=vf_coef,
        sde_sample_freq=sde_sample_freq,
        policy_kwargs=dict(
            log_std_init=log_std_init,
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,

        ),
        tensorboard_log=log_tensorboard,
        verbose=0
    )

    # ======================================================================== Hyper Parameters

    # ======================================================================== Evaluation

    class RewardCallback(BaseCallback):

        """
        Callback for saving a model (the check is done every ``check_freq`` steps)
        based on the training reward (in practice, we recommend using ``EvalCallback``).

        :param check_freq: (int)
        :param log_dir: (str) Path to the folder where the model will be saved.
        It must contain the file created by the ``Monitor`` wrapper.
        :param verbose: (int)
        """

        def __init__(self, check_freq: int, log_dir: str, verbose=1):
            super(RewardCallback, self).__init__(verbose)
            self.check_freq = check_freq
            self.log_dir = log_dir
            self.save_path = os.path.join(log_dir, 'best_model')
            self.best_mean_reward = -np.inf

        def _init_callback(self) -> None:
            # Create folder if needed
            if self.save_path is not None:
                os.makedirs(self.save_path, exist_ok=True)

        def _on_step(self) -> bool:
            if self.n_calls % self.check_freq == 0:

                # Retrieve training reward
                x, y = ts2xy(load_results(self.log_dir), 'timesteps')
                if len(x) > 0:
                    global episodes
                    global mean_reward
                    global best_reward
                    episodes = len(y)
                    # print(episodes)
                    mean_reward = np.mean(y[-50:])
                    mean_reward = round(mean_reward, 0)
                    if self.verbose > 0:
                        print(f"Episodes: {episodes}")
                        print(f"Num steps: {self.num_timesteps}")
                        print(f"Mean reward: {mean_reward:.2f} ")
                        print("=================================")
                    # Report intermediate objective value to Optuna and handle pruning
                    trial.report(mean_reward, self.num_timesteps)
                    if trial.should_prune():
                        raise optuna.TrialPruned()

                    # New best model, you could save the agent here
                    if mean_reward > best_reward:
                        best_reward = mean_reward
                        if mean_reward > reward_threshold:
                            print("REWARD ACHIVED")
                            model.save(f"{self.save_path}/reward_achived_{str(mean_reward)}")
                            return False
                        else:
                            model.save(f"{self.save_path}/best_model")
                        

                    # New best model, you could save the agent here
                    # if episodes > episodes_threshold:
                    #     print("REWARD ACHIVED")
                    #     model.save(self.save_path)
                    #     return False



            return True

    # ======================================================================== Training

    check_freq = int(timesteps/10) if int(timesteps/10)>0 else 1
    callback = RewardCallback(check_freq=check_freq, log_dir=logs_base_dir)
    model.learn(total_timesteps=int(timesteps), callback=callback)


    # ==== Reset environment
    del model
    env.reset()

    return mean_reward
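The objective above reads several module-level globals that are not shown here (env_id, timesteps, logs_base_dir, log_tensorboard, best_reward, reward_threshold). A minimal sketch of how such an objective is typically driven by an Optuna study, with hypothetical values for those globals:

import optuna

# Hypothetical values for the globals the objective expects
env_id = "CartPole-v1"
timesteps = 100_000
logs_base_dir = "./logs/"
log_tensorboard = "./tb/"
best_reward = float("-inf")
reward_threshold = 475

study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
print("Best hyperparameters:", study.best_params)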
Example #9
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    tensorboard_log="./her_overcooked",
    batch_size=256,
    online_sampling=online_sampling,
    action_noise=action_noise,
    # policy_kwargs=dict(net_arch=[256, 256, 256]),
)

# model = HER.load('./her_bit_env250.zip', env=env)
# Train the model
for i in range(1000):
    model.learn(10000)
    model.save(f"./her_bit_env{i}")

# model = HER.load('./her_bit_env', env=env)

obs = env.reset()
episode_reward = 0.0
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get("is_success", False):
        print("Reward:", episode_reward, "Success?",
              info.get("is_success", False))
        episode_reward = 0.0
        obs = env.reset()
Example #10
class OffPolicy_BaseLine(RLSPAgent):
    """
    RLSP DDPG Agent
    This class creates a DDPG agent with params for RLSP
    """
    def __init__(self, agent_helper):
        self.agent_helper = agent_helper
        # create model
        # TODO: add the number of envs for multiprocessing later, for faster training
        self.create()
        pass

    def create(self, n_envs=1):
        """Create the agent"""
        self.env = self.agent_helper.env
        log_dir = self.agent_helper.config_dir
        os.makedirs(log_dir, exist_ok=True)
        self.env = Monitor(self.env, log_dir)
        #TODO:
        # Create the DDPG policy and define its hyperparameters here, including the action and observation spaces.
        # add policy
        policy_name = self.agent_helper.config['policy']
        self.policy = eval(policy_name)
        # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        n_actions = int(self.agent_helper.env.action_space.shape[0])
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))

        #FIXME: test:
        # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise, verbose=1, tensorboard_log=self.agent_helper.graph_path)

        # TODO: fix the observation space and action space later. Test whether the observation space input and the action space output are correct.
        # activ_function_name = self.agent_helper.config['nn_activ']
        # activ_function = eval(activ_function_name)

        # policy_kwargs = dict(activation_fn=activ_function,
        #              net_arch=[dict(pi=[32, 32], qf=[32, 32])])
        policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])

        self.model = OffPolicyAlgorithm(
            self.policy,
            self.env,
            learning_rate=self.agent_helper.config['learning_rate'],
            buffer_size=self.agent_helper.config['buffer_size'],
            batch_size=self.agent_helper.config['batch_size'],
            tau=self.agent_helper.config['tau'],
            gamma=self.agent_helper.config['gamma'],
            gradient_steps=self.agent_helper.config['gradient_steps'],
            action_noise=action_noise,
            optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
            create_eval_env=self.agent_helper.config['create_eval_env'],
            policy_kwargs=policy_kwargs,
            verbose=self.agent_helper.config['verbose'],
            learning_starts=self.agent_helper.config['learning_starts'],
            tensorboard_log=self.agent_helper.graph_path,
            seed=self.agent_helper.seed)
        pass

    def test_env(self):
        logger.info(f"Model: {self.model.get_env()}")

    def fit(self,
            env,
            episodes,
            verbose,
            episode_steps,
            callbacks,
            log_interval,
            agent_id=-1):
        """Mask the agent fit function
        To train the agent
        """
        logger.info("herer")
        # self.model.learn(total_timesteps=100, log_interval=10)
        # FIXME: use a meaningful tensorboard log name!

        #TODO: Write callback funcs here:
        # List of callback:
        # Checkpoint Callback: save the model every 10 episodes.
        checkpoint_callback = CheckpointCallback(
            save_freq=96,
            save_path=self.agent_helper.config_dir,
            name_prefix='rl_model')
        # Eval Callback: evaluate every eval_freq, save the best model to best_model_save_path.
        eval_env = env
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/',
                                     eval_freq=500,
                                     deterministic=True,
                                     render=False)
        # StopTrainingOnRewardThreshold: stop training once the reward threshold is reached, i.e. the agent is good enough.
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70,
                                                         verbose=1)
        eval_callback_reward_threshold = EvalCallback(
            eval_env, callback_on_new_best=callback_on_best, verbose=1)
        # EveryNTimesteps: trigger a callback every n timesteps, here to save the model.
        checkpoint_on_event = CheckpointCallback(save_freq=1,
                                                 save_path='./logs/')
        event_callback_after_n_steps = EveryNTimesteps(
            n_steps=500, callback=checkpoint_on_event)

        # StopTrainingOnMaxEpisodes:
        # Stops training when the model reaches the maximum number of episodes
        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5,
                                                          verbose=1)

        # CallbackList: to call several callback together.
        callbacklist = CallbackList([checkpoint_callback, eval_callback])

        logger.info(f"Model: {self.model.get_env()}")
        with ProgressBarManager(log_interval) as progress_callback:
            self.model.learn(total_timesteps=log_interval,
                             callback=[progress_callback, checkpoint_callback])
        # mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        # self.eval_writer(mean_reward, std_reward)
        pass

    def test(self, env, episodes, verbose, episode_steps, callbacks, sim):
        """Mask the agent fit function"""
        logger.info(f"episodes: {episodes}, episode_steps: {episode_steps}")
        if self.agent_helper.train:
            # Create a fresh simulator with test argument
            logger.info("Create new Environment!")
            self.agent_helper.env.simulator = create_simulator(
                self.agent_helper)
        obs = self.env.reset()
        self.setup_writer()
        self.setup_run_writer()
        episode = 1
        step = 0
        episode_reward = 0.0
        done = False
        # action, _states = self.model.predict(obs)
        # obs, reward, dones, info = self.env.step(action)
        # logger.info(f"info: {info}")

        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward += reward
            self.write_run_reward(step, reward)
            if sim:
                step = info['sim_time']
                if step >= (self.agent_helper.episode_steps *
                            self.agent_helper.n_steps_per_episode):
                    done = True
                self.write_reward(episode, episode_reward)
            else:
                step = info['step']
                if step >= self.agent_helper.episode_steps:
                    done = True
                self.write_reward(episode, episode_reward)
                # episode += 1

            # # sys.stdout.write(
            #     "\rTesting:" +
            #     f"Current Simulator Time: {step}. Testing duration: {self.agent_helper.episode_steps}\n")
            # sys.stdout.flush()
        # print("")
        pass

    def save_weights(self, file, overwrite=True):
        weights_file = f"{file}weights"
        dir_path = os.path.dirname(os.path.realpath(weights_file))
        os.makedirs(dir_path, exist_ok=True)

        # After training is done, we save the final weights in the result_base_path.
        logger.info("saving model and weights to %s", weights_file)
        # self.agent.save_weights(weights_file, overwrite)
        self.model.save(weights_file)
        pass

    def load_weights(self, weights_file):
        """ Load the model from a zip archive """
        self.model = OffPolicyAlgorithm.load(weights_file)
        pass

    def setup_writer(self):
        episode_reward_filename = f"{self.agent_helper.config_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename,
                                          'a+',
                                          newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def setup_run_writer(self):
        run_reward_filename = f"{self.agent_helper.config_dir}/run_reward.csv"
        run_reward_header = ['run', 'reward']
        self.run_reward_stream = open(run_reward_filename, 'a+', newline='')
        self.run_reward_writer = csv.writer(self.run_reward_stream)
        self.run_reward_writer.writerow(run_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])

    def write_run_reward(self, step, reward):
        self.run_reward_writer.writerow([step, reward])

    def eval_writer(self, mean_reward, std_reward):
        episode_reward_filename = f"{self.agent_helper.config_dir}evaluate_agent.csv"
        episode_reward_header = ['mean_reward', 'std_reward']
        self.episode_reward_stream = open(episode_reward_filename,
                                          'a+',
                                          newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)
        self.episode_reward_writer.writerow([mean_reward, std_reward])

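The create() method above drives a generic OffPolicyAlgorithm from a config dict; the commented-out line inside it shows the more common direct form. A minimal standalone sketch of that DDPG variant, assuming a continuous-action gym environment and hypothetical log paths:

import numpy as np
import gym
from stable_baselines3 import DDPG
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise

env = Monitor(gym.make("Pendulum-v1"), "ddpg_run")  # hypothetical env; writes ddpg_run.monitor.csv

n_actions = env.action_space.shape[0]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1,
             tensorboard_log="./ddpg_tensorboard/")
model.learn(total_timesteps=10_000)
model.save("ddpg_pendulum")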
Example #11
def main(args, unknown_args):  # noqa: C901


    # path to the configuration file 
    path = os.path.join(script_dir,'configs', args.config)
    
    # check if the algorithm is implemented     
    if args.algo not in ALGOS:
        raise NotImplementedError('The specified algorithm has not been recognized!')

    # parsing the config file and the args parser 
    config_file = configparser.ConfigParser()
    config_file.read(path)
    n_timesteps = config_file.getint('ADAPT','total_timesteps')
    env_id = config_file['ADAPT']['environment']
    n_eval_episodes = 5
    n_eval_test = 5
    eval_freq = 10
    n_trials = 20

    # Create the saving directory
    log_folder = os.path.join(script_dir,'saved_models')

    algo = args.algo
    folder = log_folder

    # if args.exp_id == 0:
    #     args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
    #     print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"


    found = False
    model_path = os.path.join(log_path, "best_model.zip")
    if args.load_best:
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)


    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=False, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
 
    env = Monitor(gym.make(f"deep_calibration:{env_id}"), log_path)
    eval_env = NormalizeActionWrapper(env)

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    best_model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, **kwargs)

    obs = env.reset()

    np.set_printoptions(precision=5, suppress=True)
    try:
        # sample an observation from the environment and compute the action
        dists = []
        actions = []
        for i in range(n_eval_episodes):
            obs = eval_env.reset()
            action = best_model.predict(obs, deterministic=True)[0]
            action = eval_env.rescale_action(action)
            actions.append(action)
            dist = eval_env.distance_to_goal(action)
            print(f'distance to goal for config {i} = {dist:.6f}')
            dists.append(dist)
            # print(f'parameters for config {i} is {action}')


        print(f'mean distance = {np.mean(dists):.6f}')

        ind = np.argmin(dists)
        best_action = actions[ind]
        print(f'best distance = {dists[ind]:.6f}')
        print(f'best action =  {best_action}')

        std_actions = std(actions, best_action)

        print(f'std actions =  {std_actions}')
        print('########################################################')

        for i, action in enumerate(actions):
            print(f'config {i}')
            print(f'action = {action}')
            std_actions = std(actions, action)
            print(f'std actions =  {std_actions}')
            dists = []
            for i in range(n_eval_episodes):
                obs = eval_env.reset()
                dist = eval_env.distance_to_goal(action)
                dists.append(dist)
                print(f'distance to goal for config {i} = {dist:.6f}')
            print(f'mean distance =  {np.mean(dists):.6f}')


        # # testing for random configurations
        # eval_env.rand = 1
        # dists = []
        # for i in range(n_eval_test):
        #     obs = eval_env.reset()
        #     action = best_model.predict(obs, deterministic = True)[0]
        #     action = eval_env.rescale_action(action)

        #     dist = eval_env.distance_to_goal(action)
        #     print(f'best distance to goal for a random config {i} is  {dist}')
        #     dists.append(dist)

        # print('best random mean distance: ', np.mean(dists))

    except KeyboardInterrupt:
        pass