Example #1
def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1,
                            seed=seed)
    base_agent = ACKTR.load(
        f"./base_agent/{subenv_dict[base_index]}/model.zip")
    base_parameter_dict = base_agent.get_parameters()

    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(
            f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())

    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()

    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)

    aligned_agent.load_parameters(base_parameter_dict)
    avg_reward, reward_std = evaluate_policy(aligned_agent,
                                             base_env,
                                             n_eval_episodes=100)

    print(f"base {base_index}, weight {w} done")
    return (avg_reward, reward_std)
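
The `model_align` helper called above is not shown in this example. As a point of reference, here is a minimal sketch of one way such a helper could work, assuming it blends a weighted average of the client parameters into the base parameter dictionary in place (the name and signature are taken from the call above; the body is an assumption, not the original implementation):

def model_align(w, base_parameter_dict, sub_model_parameters, alpha=0.5):
    """Hypothetical sketch: blend a weighted average of the client
    parameters into the base parameters in place (federated-averaging style)."""
    for key in base_parameter_dict:
        # Weighted average of the clients' tensors for this parameter.
        client_avg = sum(w_i * params[key]
                         for w_i, params in zip(w, sub_model_parameters))
        # Interpolate between the base parameters and the client average.
        base_parameter_dict[key] = ((1 - alpha) * base_parameter_dict[key]
                                    + alpha * client_avg)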
Example #2
 def load_model(self, path=None):
     """ Load the model from a zip archive """
     if path is not None:
         self.model = ACKTR.load(path)
     else:
         self.model = ACKTR.load(self.params.model_path)
         # Copy the model to the new directory
         self.model.save(self.params.model_path)
def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL Training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time

        model.save("cartpole_v1_acktr")

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # Single Process RL Training
    single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
    start_time = time.time()
    single_process_model.learn(n_timesteps)
    total_time_single = time.time() - start_time

    print("Single-process: {0}s, Multi-process: {1}s".format(
        total_time_single, total_time_multi))

    # create separate clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model,
                                              eval_env,
                                              n_eval_episodes=10)
    print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')
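
The `make_env` factory passed to `SubprocVecEnv` above is defined elsewhere; a minimal sketch following the usual stable-baselines multiprocessing pattern (the per-rank seeding is an assumption):

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    """Return a thunk that builds one environment instance.
    Each worker gets its own seed (seed + rank) so the parallel
    environments do not produce identical trajectories."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init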
def get_intrinsic_reward(base_index):
    intrinsic_rewards = [[] for _ in range(len(subenv_dict))]
    # base env
    base_name = subenv_dict[base_index]
    base_env = make_vec_env(f"selected-bipedal-{base_name}-v0",
                            n_envs=1,
                            seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{base_name}/model.zip")

    # rnd model
    rnd_dict = {}
    for client_env in subenv_dict.values():
        rnd = RandomNetworkDistillation(input_size=24)
        rnd.load(f"./base{base_index}_client_model/{client_env}/rnd")
        rnd_dict[client_env] = rnd
    obs = base_env.reset()
    for _ in range(num_test):
        for i, client_env in subenv_dict.items():
            intrinsic_rewards[i].append(
                rnd_dict[client_env].get_intrinsic_reward(obs))
        action, _ = base_agent.predict(obs)
        obs, reward, done, info = base_env.step(action)
        if done:
            obs = base_env.reset()
    return intrinsic_rewards
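
The `RandomNetworkDistillation` class used above is external to this snippet. To illustrate the idea it relies on, here is a deliberately simplified NumPy sketch in which the intrinsic reward is the prediction error of a trainable linear network against a frozen random target network; the class name suffix and every detail of the body are assumptions, not the original implementation:

import numpy as np

class RandomNetworkDistillationSketch:
    """Hypothetical, simplified RND: intrinsic reward = squared error between
    a frozen random target embedding and a trainable predictor embedding."""

    def __init__(self, input_size, feature_size=16, lr=1e-3, seed=0):
        rng = np.random.RandomState(seed)
        self.target = rng.randn(input_size, feature_size)     # frozen
        self.predictor = rng.randn(input_size, feature_size)  # trained online
        self.lr = lr

    def get_intrinsic_reward(self, obs):
        obs = np.asarray(obs, dtype=np.float64).reshape(1, -1)
        error = obs @ self.predictor - obs @ self.target
        # One gradient step so frequently seen observations yield smaller rewards.
        self.predictor -= self.lr * obs.T @ error
        return float(np.mean(error ** 2))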
Example #5
def NewPotential(current_window, algorithm='PPO'):

    # Determine the pretrained agent
    if algorithm == 'A2C':
        model = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        model = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        model = ACKTR.load("pretrained_ACKTR")
    elif algorithm == 'ACER':
        model = ACER.load("pretrained_ACER")
    else:
        raise ValueError("%s is not a valid algorithm." % algorithm)

    if len(current_window) != model.observation_space.shape[0]:
        raise ValueError("%s does not match the model's window size." %
                         len(current_window))

    action, _states = model.predict(current_window, deterministic=False)

    voltages = np.linspace(0, 1, num=model.action_space.n)
    if action >= 0 and action <= model.action_space.n - 1:
        voltage = voltages[action]
    else:
        raise ValueError(
            "Received invalid action={} which is not part of the action space".
            format(action))

    return voltage
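
A small standalone check of the action-to-voltage mapping used above: with an n-way discrete action space, np.linspace(0, 1, num=n) produces evenly spaced voltage levels, so action k selects the k-th level (the value of n below is only an illustration):

import numpy as np

n = 5                                # stand-in for model.action_space.n
voltages = np.linspace(0, 1, num=n)  # [0.0, 0.25, 0.5, 0.75, 1.0]
for action in range(n):
    print(action, "->", voltages[action])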
Example #6
def run_sonobuoy_training(
                    exp_name,exp_path,
                    basicdate,
                    model_type='PPO2',
                    n_eval_episodes=10,
                    training_intervals=100,
                    max_steps=10000,
                    reward_margin=10,
                    log_to_tb=False,
                    pelican_agent_filepath=False):

    # set up logging 
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'sonobuoy_training'
    else:
        writer = None
        tb_log_name = None

        
    env = gym.make('plark-env-v0', panther_agent_filepath='/data/agents/models/PPO2_20200429_073132_panther/')
    
    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
            
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)

    else:   
        # Instantiate the env and model
        model = PPO2('CnnPolicy', env)

    # Start training 
    train_agent(exp_path, model, env, training_intervals, max_steps, model_type,
                basicdate, writer, tb_log_name, reward_margin)
                
    # Evaluate
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes,
                                              deterministic=False, render=False, callback=None,
                                              reward_threshold=None, return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean reward is ' + str(mean_reward))
    logger.info('Reward standard deviation is ' + str(std_reward))
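
The `train_agent` helper called above is defined elsewhere in the project. A minimal sketch of what such a loop might do, assuming it alternates learning and evaluation and stops once the mean reward clears `reward_margin` (the signature mirrors the call above; the body is an assumption):

from stable_baselines.common.evaluation import evaluate_policy

def train_agent(exp_path, model, env, training_intervals, max_steps,
                model_type, basicdate, writer, tb_log_name, reward_margin):
    """Hypothetical sketch: train in intervals, evaluate, save a checkpoint,
    and stop early once the mean reward exceeds reward_margin."""
    for interval in range(training_intervals):
        model.learn(total_timesteps=max_steps)
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        if writer is not None:
            writer.add_scalar(tb_log_name, mean_reward, interval)
        model.save('{}/{}_{}_{}'.format(exp_path, model_type, basicdate, interval))
        if mean_reward > reward_margin:
            break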
Example #7
def run_illegal_move_training(
                    exp_name,exp_path,
                    basicdate,
                    model_type='PPO2',
                    n_eval_episodes=10,
                    training_intervals=100,
                    max_steps=10000,
                    reward_margin=10,
                    log_to_tb=False,
                    pelican_agent_filepath=False):
    
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'Illegal_move_prevention_training'
    else:
        writer = None
        tb_log_name = None
    
    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        # env = plark_env_illegal_move.PlarkEnvIllegalMove( config_file_path='/Components/plark-game/plark_game/game_config/10x10/balanced.json')
        env = gym.make('plark-env-illegal-move-v0')

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
            
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)

    else:   
        # Instantiate the env and model
        env = gym.make('plark-env-illegal-move-v0')
        model = PPO2('CnnPolicy', env)

    # Start training 
    train_agent(exp_path, model, env, training_intervals, max_steps, model_type,
                basicdate, writer, tb_log_name, reward_margin)
                
    # Evaluate
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes,
                                              deterministic=False, render=False, callback=None,
                                              reward_threshold=None, return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean reward is ' + str(mean_reward))
    logger.info('Reward standard deviation is ' + str(std_reward))
Example #8
 def loadAgent(self, filepath, algorithm_type):
     try:
         if algorithm_type.lower() == 'dqn':
             self.model = DQN.load(filepath)
         elif algorithm_type.lower() == 'ppo2':
             self.model = PPO2.load(filepath)
         elif algorithm_type.lower() == 'a2c':
             self.model = A2C.load(filepath)
         elif algorithm_type.lower() == 'acktr':
             self.model = ACKTR.load(filepath)
     except Exception:
         raise ValueError('Error loading pelican agent. File : "' +
                          filepath + '" does not exist')
 def loadAgent(self, filepath, algorithm_type):
     try:
         if algorithm_type.lower() == "dqn":
             self.model = DQN.load(filepath)
         elif algorithm_type.lower() == "ppo2":
             self.model = PPO2.load(filepath)
         elif algorithm_type.lower() == "ppo":
             self.model = PPO.load(filepath)
         elif algorithm_type.lower() == "a2c":
             self.model = A2C.load(filepath)
         elif algorithm_type.lower() == "acktr":
             self.model = ACKTR.load(filepath)
     except Exception:
         raise ValueError('Error loading panther agent. File : "' +
                          filepath + '" does not exist')
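
For reference, the if/elif chains in the two loadAgent methods above can be collapsed into a lookup table; a minimal sketch (not part of the original code):

from stable_baselines import DQN, PPO2, A2C, ACKTR

ALGORITHMS = {'dqn': DQN, 'ppo2': PPO2, 'a2c': A2C, 'acktr': ACKTR}

def load_agent(filepath, algorithm_type):
    # Look up the algorithm class first so an unknown name is reported clearly.
    try:
        algorithm = ALGORITHMS[algorithm_type.lower()]
    except KeyError:
        raise ValueError('Unknown algorithm type: ' + algorithm_type)
    return algorithm.load(filepath)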
def save_client(base_index, subenv_id):
    base_agent = ACKTR.load(
        f"./base_agent/{subenv_dict[base_index]}/model.zip")

    subenv = subenv_dict[subenv_id]
    env = make_vec_env(f"selected-bipedal-{subenv}-v0",
                       n_envs=n_envs,
                       seed=seed)
    learner = base_agent
    learner.env = env
    learner.verbose = 0
    callback = SaveRNDDatasetCallback(base_index=base_index)
    learner.learn(
        total_timesteps=client_timesteps,
        callback=callback,
    )

    dir_name = f"base{base_index}_client_model/{subenv}"
    Path(dir_name).mkdir(parents=True, exist_ok=True)
    learner.save(f"{dir_name}/policy.zip")
    print(f"base {base_index} sub-env {subenv} done")
def eval_base_agent(agent_index):
    mean_result = []
    std_result = []
    agent = ACKTR.load(f"./base_agent/{subenv_dict[agent_index]}/model.zip")
    for env_index in range(4):
        env = gym.make(f"selected-bipedal-{subenv_dict[env_index]}-v0")
        env.seed(seed)
        mean, std = evaluate_policy(agent, env, n_eval_episodes=100)
        mean_result.append(mean)
        std_result.append(std)
    Path("log").mkdir(parents=True, exist_ok=True)
    file = open(f"log/agent{agent_index}_simple_agent_test.csv",
                "w",
                newline="")
    writer = csv.writer(file)
    writer.writerow(mean_result)
    writer.writerow(std_result)
    file.close()
    print(f">>> Agent {agent_index}:")
    print(mean_result)
    print(std_result)
    return
Example #12
def train_acktr(seed):
    """
    Test ACKTR on the uav_env (cartesian, discrete).

    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25,
    vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
    lr_schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
    async_eigen_decomp=False)
    """
    algo = 'ACKTR'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
                  max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
                  tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo),
                  _init_setup_model=True)
    # , async_eigen_decomp=False)

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACKTR.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
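
The `callback` passed to `model.learn` above, together with the `best_mean_reward` and `n_steps` globals, follows the legacy stable-baselines callback pattern (a fragment of it appears in a later example). A minimal sketch, assuming the training environment is wrapped in a Monitor writing to /tmp/gym/ as the os.rename call above suggests:

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

log_dir = '/tmp/gym/'   # assumed Monitor directory, matching the monitor.csv path above
best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    """Every 500 calls, read the Monitor log and save the model
    whenever the recent mean reward improves."""
    global best_mean_reward, n_steps
    if (n_steps + 1) % 500 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True  # returning False would stop training early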
Example #13
#! /usr/bin/env python

import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

model = ACKTR.load("models/acktr_goleft", env=env)

obs = env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break
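
GoLeftEnv comes from the accompanying env module and is not shown here. For context, a minimal sketch of the tutorial-style environment this name usually refers to: the agent starts at the right edge of a one-dimensional grid and is rewarded only for reaching the leftmost cell (the reward value and observation encoding are assumptions):

import numpy as np
import gym
from gym import spaces

class GoLeftEnvSketch(gym.Env):
    """Hypothetical 1-D grid world: go left to reach the goal at cell 0."""
    LEFT, RIGHT = 0, 1

    def __init__(self, grid_size=10):
        super(GoLeftEnvSketch, self).__init__()
        self.grid_size = grid_size
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=0, high=grid_size,
                                            shape=(1,), dtype=np.float32)

    def reset(self):
        self.agent_pos = self.grid_size - 1
        return np.array([self.agent_pos], dtype=np.float32)

    def step(self, action):
        self.agent_pos += -1 if action == self.LEFT else 1
        self.agent_pos = int(np.clip(self.agent_pos, 0, self.grid_size))
        done = self.agent_pos == 0
        reward = 1 if done else 0
        return np.array([self.agent_pos], dtype=np.float32), reward, done, {}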
Example #14
     train(model, env, out_dir)
 else:
     #results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "rl")
     path = '{}/best_model.zip'.format(args.eval)
     env = CarEnv(args.eval, cam_idx_list=(0, 3, 4))
     env.next_weather()
     #env = Monitor(env, args.eval)
     #print(env.num_envs)
     if args.model == 'trpo':
         model = TRPO.load(path)
     elif args.model == 'acer':
         model = ACER.load(path)
     elif args.model == 'ppo':
         model = PPO2.load(path)
     elif args.model == 'acktr':
         model = ACKTR.load(path)
     elif args.model == 'ddpg':
         model = DDPG.load(path)
     elif args.model == 'a2c':
         model = A2C.load(path)
     elif args.model == 'sac':
         model = SAC.load(path)
     #mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5,return_episode_rewards=True)
     #eps_rewards, eps_len = evaluate_policy(model, env, n_eval_episodes=5,return_episode_rewards=True)
     # print(eps_rewards)
     # print(eps_len)
     # print(np.mean(eps_rewards))
     #print("Mean reward = {}","Std reward = {}".format(np.mean(eps),std_reward))
     rs = evaluate(model, env)
     with open("{}/result.txt".format(args.eval), 'w') as f:
         for item in rs:
Example #15
envTmp = gym.make('Battleships-v0', config=config)

#Wrap environment into a vector environment
env = DummyVecEnv([lambda: envTmp])

# Choose to display board
print("Display board: Yes (1), No (0)")
choiceRender = bool(int(input()))

# Choose Model
randomAgent = True
print("Choose Agent: Random (1), ACKTR (2), DQN (3)")
choice = int(input())
if choice == 2:
    # Load ACKTR Model
    model = ACKTR.load("./ACKTR_Models/ACKTR_5x5_3_2_2_Dynamic.zip", verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False

elif choice == 3:
    # load DQN Model
    model = DQN.load("./DQN_Models/DQN_5x5_3_2_2_Dynamic.zip", verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False

# Initialize the results array
results = []
# Iteration: Amount of played Games
for iteration in range(10):
    score = 0
    print('Iteration', iteration)
Example #16
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)
import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.autograph.set_verbosity(0)
import logging
tf.get_logger().setLevel(logging.ERROR)

import env_yaw
import gym
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = gym.make("Yaw-v0")
env = make_vec_env(lambda: env, n_envs=1)
model = ACKTR.load("models/acktr_yaw", env=env)

obs = env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        print("Goal reached!", "reward=", reward)
        break
Example #17
        pass

    def close(self):
        pass


env = AItest()
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
#Train the agent and save
#model = ACKTR('MlpPolicy', env, verbose=1).learn(50000)
#print ("learning done")
#model.save('Macke_AI')
#print ("save done")
# run with saved agent
model = ACKTR.load('Macke_AI')

## Test the trained agent
#obs = env.reset()
#n_steps = 100
#for step in range(n_steps):
#  action, _ = model.predict(obs, deterministic=True)
#  #print("Step {}".format(step + 1))
#  #print("Action: ", action)
#  obs, reward, done, info = env.step(action)
#  #print('obs=', obs, 'reward=', reward, 'done=', done)
#  env.render()
#  if done:
#    # Note that the VecEnv resets automatically
#    # when a done signal is encountered
#    print("Goal reached!", "reward=", reward)
Example #18
        param_list=config['param_list']), config['wrapper_args'])
else:
    env = gym.make('jackal_navigation-v0',
                   gui=gui,
                   VLP16=config['VLP16'],
                   world_name=config['world_name'],
                   init_position=config['init_position'],
                   goal_position=config['goal_position'],
                   max_step=config['max_step'],
                   time_step=config['time_step'],
                   param_delta=config['param_delta'],
                   param_init=config['param_init'],
                   param_list=config['param_list'])

if config['algorithm'] == 'ACKTR':
    model = ACKTR.load(model_path)
elif config['algorithm'] == 'PPO2':
    model = PPO2.load(model_path)
elif config['algorithm'] == 'DQN':
    model = DQN.load(model_path)

range_dict = {
    'max_vel_x': [0.1, 2],
    'max_vel_theta': [0.314, 3.14],
    'vx_samples': [1, 12],
    'vtheta_samples': [1, 40],
    'path_distance_bias': [0.1, 1.5],
    'goal_distance_bias': [0.1, 2]
}

rs = []
Example #19
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(model_file)
    n_steps += 1
    # Returning False will stop training early
    return True


env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
if os.path.isfile(model_file):
    model = ACKTR.load(model_file, env=env)
else:
    model = ACKTR(
        MlpLnLstmPolicy,
        env,
        tensorboard_log=f"./test{base_test_file}/",
        verbose=0
    )  # add tensorboard_log="./test/" and run tensorboard --logdir /Users/constantin/Documents/bn/rl/test/PPO2_1
model.learn(total_timesteps=10**5, callback=callback)

# def evaluate(model, num_steps=1000):
#     obs = env.reset()
#     for i in range(num_steps):
#         # _states are only useful when using LSTM policies
#         action, _states = model.predict(obs)
#
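
The commented-out evaluate helper above is cut off. For reference, a manual evaluation loop in the same style, assuming the DummyVecEnv named env defined earlier in this example (a sketch, not the original code):

import numpy as np

def evaluate(model, num_steps=1000):
    """Roll the policy out for num_steps in the vectorized env and
    return the mean per-episode reward."""
    obs = env.reset()
    episode_rewards = [0.0]
    for _ in range(num_steps):
        # _states are only useful when using LSTM policies
        action, _states = model.predict(obs)
        obs, rewards, dones, _ = env.step(action)
        episode_rewards[-1] += rewards[0]
        if dones[0]:
            obs = env.reset()
            episode_rewards.append(0.0)
    return float(np.mean(episode_rewards))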
Example #20
        #     shutil.rmtree(checkpoint_name)

        tf.saved_model.simple_save(
            model.sess,
            checkpoint_name,
            inputs={"obs": model.act_model.obs_ph},
            outputs={"action": model.act_model._deterministic_action})


if __name__ == '__main__':
    if os.path.isdir(file):
        shutil.rmtree(file)
    if args.algo == 'ppo':
        model = PPO2.load(file)
    elif args.algo == 'acktr':
        model = ACKTR.load(file)

    # generate_checkpoint_from_model(model, file)
    # converter = tf.lite.TFLiteConverter.from_saved_model(file)
    # tflite_model = converter.convert()
    # open(file + "/converted_model.tflite", "wb").write(tflite_model)

    # multiprocess environment
    n_cpu = 1
    env = SubprocVecEnv(
        [lambda: gym.make('PendulumA-v0', renders=True) for i in range(n_cpu)])
    obs = env.reset()
    # When using VecEnv, done is a vector
    done = [False for _ in range(env.num_envs)]

    while True:
    model = ACKTR(get_policy(policy),
                  env,
                  n_steps=100,
                  verbose=0,
                  gae_lambda=0.95,
                  vf_fisher_coef=0.5,
                  tensorboard_log=tensorboard_folder,
                  kfac_update=10,
                  n_cpu_tf_sess=2,
                  async_eigen_decomp=False)
    model.learn(total_timesteps=100000000,
                tb_log_name='ACKTR_PPO2' + model_tag)

    model.save(model_folder + "ACKTR_PPO2" + model_tag)
    del model
    model = ACKTR.load(model_folder + "ACKTR_PPO2" + model_tag)

    done = False
    states = None
    action_masks = []
    obs = env.reset()

    while not done:
        action, states = model.predict(obs, states, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        env.render()
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value
        model.learn(total_timesteps=1000000,
                    reset_num_timesteps=False,
                    callback=callback)
        model.save(log_dir + 'model_PPO_' + str(id + 1))

    if args.algo == "acktr":
        id = balboa.utils.tensorboard_latest_directory_number(
            log_dir, 'ACKTR_')
        print('Using acktr')
        if args.load_id == None:
            # tensorboard_log=log_dir
            model = ACKTR("MlpPolicy",
                          env,
                          policy_kwargs=policy_kwargs,
                          ent_coef=0.0,
                          verbose=1)
            # verbose=1, n_steps=48, learning_rate=0.1, lr_schedule='constant',
        else:
            print("Loading model: " + str(args.load_id))
            model = ACKTR.load(log_dir + 'ACKTR_' + str(args.load_id) + ".zip",
                               env=env)
        model.tensorboard_log = log_dir
        # model.learning_rate = stable_baselines.common.schedules.LinearSchedule(1.0, 0.06, initial_p=0.06).value
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value

        model.learn(total_timesteps=3000000,
                    reset_num_timesteps=False,
                    callback=callback)
        print("Saving to: " + log_dir + 'ACKTR_' + str(id + 1))
        model.save(log_dir + 'model_ACKTR_' + str(id + 1))
Example #23
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    """
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25,
     vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
      lr_schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
       async_eigen_decomp=False, policy_kwargs=None, full_tensorboard_log=False)
    """

    # model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
    #               ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
    #               max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
    #               tensorboard_log=None, _init_setup_model=True)

    model = ACKTR.load(
        '/home/daniel/Desktop/Experiment_CurriculumLearning_2019-04-12_13-06-45/curriculum/curriculum/curriculum_seed_0/models/model_3000000_steps.pkl'
    )
    model.set_env(env)

    model.learn(total_timesteps=num_timesteps,
                callback=callback,
                seed=seed,
                log_interval=100)

    images = []
    obs = model.env.reset()
    # img = model.env.render(mode='rgb_array')
    model.env.render(mode='human')

    # for i in range(30000):
    #     # images.append(img)
Example #24
from stable_baselines.common.policies import MlpPolicy
#from stable_baselines.common.policies import LnMlpPolicy
#from stable_baselines import PPO2
from stable_baselines import ACKTR
import os
from callback import SaveOnBestTrainingRewardCallback
from stable_baselines.bench import Monitor
from stable_baselines.common import make_vec_env

# Create the Gym environment
env = Manipulator2D()

load_model_path = "tmp9/acktr_16110000.zip"
#load_model_path = "ppo2-mani7.zip"
# Load the weights from the saved training file
model = ACKTR.load(load_model_path)

# Initialize the simulation environment
obs = env.reset()

points = 0
total_time = 0

while (total_time <= 120):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:

        total_time += env.t

        # env.buffer = env.buffer_csv
Example #25
    env6 = DummyVecEnv([lambda: env6])
    state[6] = env6.reset()
    env7 = gym.make(env_id[7])
    env7 = DummyVecEnv([lambda: env7])
    state[7] = env7.reset()
    env8 = gym.make(env_id[8])
    env8 = DummyVecEnv([lambda: env8])
    state[8] = env8.reset()
    env9 = gym.make(env_id[9])
    env9 = DummyVecEnv([lambda: env9])
    state[9] = env9.reset()

    print('Environment Created')
    model_name = 'ACKTR_MlpLSTM_' + group + '_' + args.reward
    MODEL_PATH = 'Saved_models'
    tr_model = ACKTR.load(MODEL_PATH + '/' + model_name)

    t = 480  ## number of time steps to evaluate. t = 480 is 1 day
    all_state = np.zeros((10, t))

    print('Simulation Started ... ...')
    for i in range(t):
        aa, _ = tr_model.predict(state)
        # print(aa)
        action = Action(basal=aa[0] / 6000, bolus=0)
        state[0], reward, done, _ = env0.step(action)
        action = Action(basal=aa[1] / 6000, bolus=0)
        state[1], reward, done, _ = env1.step(action)
        action = Action(basal=aa[2] / 6000, bolus=0)
        state[2], reward, done, _ = env2.step(action)
        action = Action(basal=aa[3] / 6000, bolus=0)
Example #26
from helperFun import grav_options
import sys
import pprint

if __name__ == "__main__":
    try:
        if len(sys.argv) < 3:
            raise StringException("Usage: training_logs/<path to agent model> <grav-option>")
        # env = myPandaFreeSpaceTraj(has_renderer=True)
        run_name = sys.argv[1]
        grav_option = sys.argv[2]
        if grav_option == "ee_PD_cont":
            env = myPandaIKWrapper3D(has_renderer=True)
        else:
            env = myPandaFreeSpace1Goal(has_renderer=True, grav_option=grav_options[grav_option])
        model = ACKTR.load("training_logs/" + run_name)


        # mean_reward, n_steps = evaluate_policy(model, env, 10)
        # print("avg reward:{}\nnumber of steps:{}".format(mean_reward, n_steps))
        ## Play Agent
        done = False
        obs = env.reset()
        cum_reward = 0
        action_band = 10
        count = 0
        pp = pprint.PrettyPrinter()
        while True:
            if done:
                print("Reward:", cum_reward)
                cum_reward = 0
Example #27
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = ACKTR(get_policy(policy),
              env,
              verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)

model.save(model_folder + "ACKTR_A2C" + model_tag)
del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
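
Both fragments above that construct an ACKTR model call a `get_policy` helper that is not shown. A minimal sketch of such a mapping from a command-line tag to a policy class (the accepted tags are assumptions):

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy

def get_policy(policy):
    """Hypothetical mapping from a command-line tag to a policy class;
    an empty tag falls back to the plain MLP policy."""
    return {
        '': MlpPolicy,
        'lstm': MlpLstmPolicy,
        'lnlstm': MlpLnLstmPolicy,
    }.get(policy, MlpPolicy)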
Example #29
    set_global_seeds(seed)
    return _init

if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    #env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    #env = gym.make(env_id)
    env = CustomEnv(3, 6, "tcp://*:5556")
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/env_id/"
    os.makedirs(log_dir, exist_ok=True)
    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

   # env = Monitor(env, log_dir)

    # ACKTR.load is a classmethod that returns a new model, so assign its result;
    # calling load() on an existing instance would discard the loaded weights.
    model = ACKTR.load("RL_agent", env=env, verbose=2)

    while True:
        user_in = input("Enter States: ").split(',')
        obs = [int(i) for i in user_in]
        print(model.action_probability(obs))
        action, _ = model.predict(obs, deterministic=True)
        print(action)
Example #30
# Callback safes the currently best model
eval_callback = EvalCallback(env4,
                             callback_on_new_best=callback_on_best,
                             verbose=1,
                             best_model_save_path='./ACKTR_Models/best/')
checkpoint_callback = CheckpointCallback(save_freq=1e4,
                                         save_path='./model_checkpoints/')

# Uncomment to train a fresh model; otherwise the already trained model below is trained further.
# If a fresh model is trained, it should first be trained with the binary reward (Config)
# to reduce repeated shots onto the same field.
#model = ACKTR(MlpPolicy, env, verbose=2, tensorboard_log="./logs/progress_tensorboard/",  n_cpu_tf_sess=4)

# Load current best model
model = ACKTR.load("./ACKTR_Models/best/best_model.zip",
                   verbose=2,
                   env=env,
                   tensorboard_log="./logs/progress_tensorboard/")

# Train model
model.learn(1000000, callback=[checkpoint_callback, eval_callback])

# Delete current model and load the best model
del model
model = ACKTR.load("./ACKTR_Models/best/best_model.zip",
                   verbose=2,
                   env=env,
                   tensorboard_log="./logs/progress_tensorboard/")

# Test trained model
results = []
for iteration in range(100):