Example #1
def run():
    torch.multiprocessing.freeze_support()  # needed when the script is frozen into a Windows executable
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
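# The snippet above relies on a `make_env` helper that is not shown here
# (its tail is visible at the top of Example #10). A minimal sketch, assuming
# the per-process seeding pattern from the Stable Baselines docs:
import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    """Return a thunk that builds and seeds one copy of the environment."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init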
Example #2
def test_action_mask_run_acktr(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = ACKTR(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
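# The `action_mask` keyword to `predict` is not part of upstream Stable
# Baselines; it comes from an action-masking fork. The test assumes each
# environment publishes its next mask through `info`. A minimal sketch of
# such an environment (the name MaskedDiscreteEnv is hypothetical):
import gym
import numpy as np
from gym import spaces

class MaskedDiscreteEnv(gym.Env):
    """Toy env that reports a validity mask for its discrete actions."""

    def __init__(self, n_actions=4):
        self.action_space = spaces.Discrete(n_actions)
        self.observation_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
        self._steps = 0

    def reset(self):
        self._steps = 0
        return np.zeros(2, dtype=np.float32)

    def step(self, action):
        self._steps += 1
        done = self._steps >= 10
        mask = np.ones(self.action_space.n, dtype=bool)
        mask[0] = self._steps % 2 == 1  # action 0 is invalid on even steps
        obs = np.random.uniform(-1.0, 1.0, 2).astype(np.float32)
        return obs, 0.0, done, {'action_mask': mask}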
Example #3
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_acktr(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = ACKTR("MlpPolicy", env, nprocs=1, verbose=0, **model_params)
    print("DOING LEARING acer")
    original_env.force_progression = False
    model.learn(int(2e4), seed=seed)
    print("DONE LEARING acer")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward[0]  # DummyVecEnv returns arrays, so index env 0

        if done[0]:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward, n_episodes)  # Optuna's Trial.report() requires a step argument

    return last_reward
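# `optimize_acktr` is not shown in this excerpt; a minimal sketch of such a
# hyperparameter-sampling helper (the exact search space is an assumption):
def optimize_acktr(trial):
    """Sample ACKTR hyperparameters for an Optuna trial."""
    return {
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'n_steps': trial.suggest_categorical('n_steps', [16, 32, 64, 128]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.0),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 0.1),
    }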
Example #4
def stable_baseline_test(env_origin):
    env = make_vec_env(lambda: env_origin, n_envs=1)
    model = ACKTR('CnnPolicy', env, verbose=1)  # train on the vectorized env
    model.learn(total_timesteps=2000000)
    print("Stable_baseline evaluation starts.....\n")
    # NOTE: evaluate_policy needs a VecEnv
    reward_mean, reward_std = evaluate_policy(model,
                                              env,
                                              n_eval_episodes=20,
                                              deterministic=False)

    print("mean reward:" + str(reward_mean) + '\n')
    print("reward std:" + str(reward_std) + '\n')

    print("custom evaluation begin\n")

    env = env_origin
    obs = env.reset()
    reward_list_total = []
    epilen_list = []
    reward_list = []
    last_end = 0
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward_list.append(rewards)
        if dones:
            obs = env.reset()
            epilen_list.append(i - last_end)
            last_end = i
            reward_list_total.append(np.sum(reward_list))
            reward_list = []
            if i > 900:
                break
    print("mean reward:{}\n".format(np.mean(reward_list_total)))
    print("mean epilen:{}\n".format(np.mean(epilen_list)))
Example #5
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #6
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = ACKTR(get_policy(policy),
              env,
              verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)

model.save(model_folder + "ACKTR_A2C" + model_tag)
del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = [False]  # DummyVecEnv returns an array of dones; track env 0
states = None
obs = env.reset()

while not done[0]:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
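# `get_policy` is not defined in these excerpts; a minimal sketch, assuming
# it maps the command-line argument to a Stable Baselines policy class:
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, CnnPolicy

def get_policy(name):
    """Return the policy class for a CLI name, defaulting to MlpPolicy."""
    policies = {'mlp': MlpPolicy, 'lstm': MlpLstmPolicy, 'cnn': CnnPolicy}
    return policies.get(name.lower(), MlpPolicy)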
Example #7
policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: ActionMaskEnv() for i in range(4)])
    env = VecFrameStack(env, 3)

    model = ACKTR(get_policy(policy),
                  env,
                  n_steps=100,
                  verbose=0,
                  vf_fisher_coef=0.5,
                  tensorboard_log=tensorboard_folder,
                  kfac_update=10,
                  n_cpu_tf_sess=2,
                  async_eigen_decomp=False)
    model.learn(total_timesteps=100000000, tb_log_name='ACKTR_A2C' + model_tag)

    model.save(model_folder + "ACKTR_A2C" + model_tag)
    del model
    model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

    done = [False]  # SubprocVecEnv returns an array of dones; track env 0
    states = None
    action_masks = []
    obs = env.reset()

    while not done[0]:
        action, states = model.predict(obs, states, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        env.render()
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)
Example #8
    #     # images.append(img)
    #     action, _ = model.predict(obs)
    #     obs, r, done, _ = model.env.step(action)
    #     # print(type(done[0]))
    #     # model.env.render(mode='human')
    #     if done[0]:
    #         print(done[0])
    #         model.env.render(mode='human')
    #     img = model.env.render(mode='rgb_array')
    #     cv2.imshow('image', img)
    #     cv2.waitKey(0)
    # cv2.destroyAllWindows()
    # if i % 20 == 0 :
    #     model.env.render(mode='human')

    # imageio.mimsave('uav_learning.gif', [img for i, img in enumerate(images) if i % 5 == 0], fps=60)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    trajectory_dir = './logs/Experiment_ACKTR_{}/trajectories/'.format(timestamp)
    os.makedirs(trajectory_dir, exist_ok=True)

    print('evaluating runs')
    obs = model.env.reset()  # obs is undefined in this excerpt; reset once (VecEnvs auto-reset on done)
    for i in range(100):
        episode_done = [False]
        while not episode_done[0]:
            action, _ = model.predict(obs)
            obs, r, episode_done, _ = model.env.step(action)
            fig = model.env.render(mode='human')
            if episode_done[0]:
                print(i)
                plt.savefig('{}run_{}_r{}.png'.format(trajectory_dir, i, r[0]))
Example #9
    #env = CustomEnv(3, 6, "tcp://*:5556")
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/Custom_env/"
    os.makedirs(log_dir, exist_ok=True)
    # Create the callback: check every 500 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=500,
                                                log_dir=log_dir)

    #env = Monitor(env, log_dir)

    model = ACKTR(MlpPolicy, env, verbose=2)
    #model.load("DQN_agent")
    model.learn(total_timesteps=20000, callback=callback)
    model.save("temp_agent")

    input("Training completed")  # pause before evaluation

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        probs = model.action_probability(obs)
        obs, rewards, dones, info = env.step(action)
        print("Observation:", obs, rewards, probs)

    results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                                 "Lane Manager")
    plt.show()
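# `SaveOnBestTrainingRewardCallback` is the custom callback from the Stable
# Baselines callback documentation, condensed here as a sketch; it assumes
# the training env is wrapped in a Monitor writing to `log_dir`:
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the mean training reward improves."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])  # mean over the last 100 episodes
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True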
Example #10
    set_global_seeds(seed)
    return _init

if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    #env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    #env = gym.make(env_id)
    env = CustomEnv(3, 6, "tcp://*:5556")
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/env_id/"
    os.makedirs(log_dir, exist_ok=True)
    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    # env = Monitor(env, log_dir)

    model = ACKTR(MlpPolicy, env, verbose=2)
    # load() is a classmethod that returns a new model, so keep its result
    model = ACKTR.load("RL_agent")

    while True:
        user_in = input("Enter States: ").split(',')
        obs = [int(i) for i in user_in]
        print(model.action_probability(obs))
        action = model.predict(obs, deterministic=True)
        print(action)
Example #11
class ACKTR_Agent:
    def __init__(self, params: Params):
        self.params: Params = params
        policy_name = self.params.agent_config['policy']
        self.policy = eval(policy_name)  # resolve the policy class named in the config

    def create_model(self, n_envs=1):
        """ Create env and agent model """
        env_cls = SprEnv
        self.env = make_vec_env(env_cls,
                                n_envs=n_envs,
                                env_kwargs={"params": self.params},
                                seed=self.params.seed)
        self.model = ACKTR(
            self.policy,
            self.env,
            gamma=self.params.agent_config['gamma'],
            n_steps=self.params.agent_config['n_steps'],
            ent_coef=self.params.agent_config['ent_coef'],
            vf_coef=self.params.agent_config['vf_coef'],
            vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
            max_grad_norm=self.params.agent_config['max_grad_norm'],
            learning_rate=self.params.agent_config['learning_rate'],
            gae_lambda=self.params.agent_config['gae_lambda'],
            lr_schedule=self.params.agent_config['lr_schedule'],
            kfac_clip=self.params.agent_config['kfac_clip'],
            kfac_update=self.params.agent_config['kfac_update'],
            async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
            verbose=self.params.agent_config['verbose'],
            tensorboard_log="./tb/acktr/",
            seed=self.params.seed,
            policy_kwargs={"params": self.params})

    def train(self):
        with ProgressBarManager(self.params.training_duration) as callback:
            self.model.learn(total_timesteps=self.params.training_duration,
                             tb_log_name=self.params.tb_log_name,
                             callback=callback)

    def test(self):
        self.params.test_mode = True
        obs = self.env.reset()
        self.setup_writer()
        episode = 1
        step = 0
        episode_reward = [0.0]
        done = False
        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward[episode - 1] += reward[0]
            if info[0]['sim_time'] >= self.params.testing_duration:
                done = True
                self.write_reward(episode, episode_reward[episode - 1])
                episode += 1
            sys.stdout.write(
                "\rTesting:" +
                f"Current Simulator Time: {info[0]['sim_time']}. Testing duration: {self.params.testing_duration}"
            )
            sys.stdout.flush()
            step += 1
        print("")

    def save_model(self):
        """ Save the model to a zip archive """
        self.model.save(self.params.model_path)

    def load_model(self, path=None):
        """ Load the model from a zip archive """
        if path is not None:
            self.model = ACKTR.load(path)
        else:
            self.model = ACKTR.load(self.params.model_path)
            # Copy the model to the new directory
            self.model.save(self.params.model_path)

    def setup_writer(self):
        episode_reward_filename = f"{self.params.result_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename,
                                          'a+',
                                          newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])
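# A minimal lifecycle sketch for the agent class above; `params` construction
# is project-specific (Params and SprEnv come from the surrounding project and
# are only assumed here):
agent = ACKTR_Agent(params)
agent.create_model(n_envs=4)  # build the vectorized env and the ACKTR model
agent.train()                 # learn for params.training_duration timesteps
agent.save_model()            # write a zip archive to params.model_path
agent.test()                  # run one evaluation episode and log the reward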