Example #1
log_path = "{}/{}/".format(args.log_folder, args.algo)

if exp_name:
    assert (not ('_' in exp_name)), 'experiment name should not include _'
    save_path = os.path.join(
        log_path,
        "{}_{}_{}".format(env_id, exp_name,
                          get_latest_run_id(log_path, env_id, exp_name) + 1))
else:
    save_path = os.path.join(
        log_path, "{}_{}".format(env_id,
                                 get_latest_run_id(log_path, env_id) + 1))

if args.log_outputs:
    # Log the outputs
    logger.configure(folder=save_path, format_strs=['log'])

params_path = "{}/{}".format(save_path, env_id)
os.makedirs(params_path, exist_ok=True)
tensorboard_log = None if args.no_tensorboard else save_path
monitor_log = None if args.no_monitor else save_path

is_atari = 'NoFrameskip' in env_id

print("=" * 10, env_id, "=" * 10)

# Load hyperparameters from yaml file
with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
    hyperparams_dict = yaml.load(f, Loader=yaml.FullLoader)  # explicit Loader required by newer PyYAML
    if is_atari:
        hyperparams = hyperparams_dict['atari']
Example #2
def main(args):

    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        os.makedirs(model_dir, exist_ok=True)
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                     verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR
    }

    # allow time for the base model to be saved out when the environment is created
    time.sleep(5)

    if args.reset or not os.path.exists(
            os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info(
            '\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env,
                          **params)

    # Callbacks
    logger.info(
        '\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                               verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0
    }

    if args.rules:
        logger.info(
            '\nSetting up the evaluation environment against the rules-based agent...'
        )
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules',
                                                verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0)
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold,
                                     args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9),
                callback=[eval_callback],
                reset_num_timesteps=False,
                tb_log_name="tb")

    env.close()
    del env
Example #3
districts_group = [
    "Cornwall", "Plymouth", "Torbay", "East Devon", "Exeter", "Mid Devon",
    "North Devon", "South Hams", "Teignbridge", "Torridge", "West Devon"
]

env = make("SEIRmulti-v0")

districts_group_ids = [
    env.unwrapped.district_idx(name) for name in districts_group
]
env = NormalizedObservationWrapper(env)
env = NormalizedRewardWrapper(env)
env = MultiAgentSelectObservation(env, districts_group_ids)
env = MultiAgentSelectAction(env, districts_group_ids, 1)
env = MultiAgentSelectReward(env, districts_group_ids)

logger.configure(folder=args.monitor_path, format_strs=["csv"])

env = DummyVecEnv([lambda: env])

print(f"tensorboard --logdir {args.monitor_path}")

layers = [args.n_hidden_units] * args.n_hidden_layers

model = PPO2(MlpPolicy,
             env,
             verbose=0,
             tensorboard_log=args.monitor_path,
             ent_coef=args.entropy_coef,
             learning_rate=args.learning_rate,
             noptepochs=args.n_epochs,
             nminibatches=args.n_minibatches,
Example #4
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed, augment_run_num, network_size,
                                policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
                                additional_note, result_dir, keys_to_include, metric_param, linear_top_vars_list=None,
                                linear_correlation_neuron_list=None, visualize=False, lagrangian_inds_to_include=None,
                                neurons_inds_to_include=None, use_lagrangian=True):
    trained_model = None
    if not use_lagrangian:
        with tf.variable_scope("trained_model"):
            common_arg_parser = get_common_parser()
            trained_args, cma_unknown_args = common_arg_parser.parse_known_args()
            trained_args.env = policy_env
            trained_args.seed = policy_seed
            trained_args.num_timesteps = policy_num_timesteps
            trained_args.run_num = policy_run_num
            trained_this_run_dir = get_dir_path_for_this_run(trained_args)
            trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir)
            trained_save_dir = get_save_dir(trained_this_run_dir)

            trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name, "pi_final")
            trained_final_params = pd.read_csv(trained_final_file, header=None).values[0]

            trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed)
            trained_model.set_pi_from_flat(trained_final_params)

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()


    this_run_dir = get_experiment_path_for_this_run(entry_point, args.num_timesteps, args.run_num,
                                                    args.seed, learning_rate=learning_rate, top_num_to_include=linear_co_threshold,
                                                    result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)


    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    linear_top_vars_list_wanted_to_print = []
    if (use_lagrangian and lagrangian_inds_to_include is None) or (not use_lagrangian and neurons_inds_to_include is None):
        # note this is only linear
        if linear_top_vars_list is None or linear_correlation_neuron_list is None:

            linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed,
                                               eval_run_num, additional_note, metric_param=metric_param)

        lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \
            get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list, linear_correlation_neuron_list, linear_co_threshold)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)
    with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp:
        json.dump(linear_top_vars_list_wanted_to_print, fp)
    with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp:
        json.dump(neurons_inds_to_include, fp)


    args.env = f'{experiment_label}_{entry_point}-v1'

    if not use_lagrangian:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": None, "trained_model": trained_model,
                    "neurons_inds_to_include": neurons_inds_to_include}
        )
    else:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include, "trained_model": None,
                    "neurons_inds_to_include": None}
        )

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = visualize
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = not visualize


    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    num_dof = walker_env.robot_skeleton.ndofs
    show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir)

    # extra run info I added for my purposes
    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    layers = [network_size, network_size]
    policy_kwargs = {"net_arch" : [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64, lam=0.95, gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0, learning_rate=learning_rate, cliprange=0.2, optimizer='adam', policy_kwargs=policy_kwargs,
                 seed=args.seed)
    model.tell_run_info(run_info)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
Example #5
from stable_baselines.common.base_class import _UnvecWrapper

from utils import make_env, ALGOS, linear_schedule, get_latest_run_id, get_wrapper_class
from utils.hyperparams_opt import hyperparam_optimization
from utils.callbacks import SaveVecNormalizeCallback
from utils.noise import LinearNormalActionNoise
from utils.utils import StoreDict

from stable_baselines.logger import configure
from utils.callbacks import GoalToleranceCallback

# Export the following environment variables for csv files
# export OPENAI_LOG_FORMAT='stdout,log,csv'
# export OPENAI_LOGDIR="log_dir"
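# Equivalent programmatic setup (a sketch, not part of the original script):
# setting these before configure() is called has the same effect as the shell
# exports above, since stable_baselines' logger reads them from the environment.
import os
os.environ.setdefault('OPENAI_LOG_FORMAT', 'stdout,log,csv')
os.environ.setdefault('OPENAI_LOGDIR', 'log_dir')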

configure()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default="CartPole-v1", help='environment ID')
    parser.add_argument('-tb', '--tensorboard-log', help='Tensorboard log dir', default='', type=str)
    parser.add_argument('-i', '--trained-agent', help='Path to a pretrained agent to continue training',
                        default='', type=str)
    parser.add_argument('--algo', help='RL Algorithm', default='ppo2',
                        type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-timesteps', help='Overwrite the number of timesteps', default=-1,
                        type=int)
    parser.add_argument('--log-interval', help='Override log interval (default: -1, no change)', default=-1,
                        type=int)
    parser.add_argument('--eval-freq', help='Evaluate the agent every n steps (if negative, no evaluation)',
                        default=10000, type=int)
Example #6
        else:
            action, value, neglogp = self.sess.run(
                [self.action, self.value_flat, self.neglogp],
                {self.obs_ph: obs})
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs})


if __name__ == '__main__':
    saver = U.ConfigurationSaver(log_dir='./logs')
    logger.configure(folder=saver.data_dir)

    raw_env = gym.make('gym_docking:docking-v3')
    # VecNormalize expects a VecEnv, so wrap the raw gym env in a DummyVecEnv first
    # (assuming the standard stable_baselines wrappers are imported in this script)
    env = DummyVecEnv([lambda: raw_env])
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=255)

    checkpoint_callback = CheckpointCallback(
        save_freq=int(5e4),
        save_path='./logs/',
        name_prefix='rl_model_621_shaping_video_10M')

    model = PPO2(
        policy=MlpPolicy,
        env=env,
        verbose=1,
        tensorboard_log="./ppo2_docking_tensorboard/",
        lam=0.95,
Example #7
def configure_logger(log_path, **kwargs):
    if log_path is not None:
        logger.configure(log_path)
    else:
        logger.configure(**kwargs)
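# Hypothetical usage of the helper above (the path and format list are assumptions,
# not from the original source): log to a directory when one is given, otherwise
# fall back to the keyword-argument form of logger.configure().
configure_logger('./logs/example_run')
configure_logger(None, format_strs=['stdout'])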
Example #8
def do_ppo(args, start_pi_theta, parent_this_run_dir, full_space_save_dir):
    """
    Runs the test
    """

    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        import shutil
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")  # load after V
    model.set_pi_from_flat(
        start_pi_theta
    )  # don't set Vf's searched from CMA, those weren't really tested.

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path
    }

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99,
    #              noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")

    env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
Example #9
    # Make the training environment
    env = make_train_env(datapaths)

    # Make the testing environments
    eval_envs = {}
    for d in test_datasets:
        path = os.path.join('./data', d + '.csv')
        output_path = os.path.join(anomaly_curve_log, d + '.csv')
        csv_file, csv_writer = generate_csv_writer(output_path)
        eval_envs[d] = {
            'env': make_eval_env(datapath=path, budget=args.budget),
            'csv_writer': csv_writer,
            'csv_file': csv_file,
            'mean_reward': 0,
        }

    # Train the model
    model = PPO2('MlpPolicy', env, verbose=1)
    model.set_eval(eval_envs, args.eval_log_interval)
    model.learn(total_timesteps=args.num_timesteps,
                log_interval=args.rl_log_interval)
    model.save(os.path.join(args.log, 'model'))


if __name__ == "__main__":
    parser = argsparser()
    args = parser.parse_args()
    logger.configure(args.log)
    train(args)
Example #10
from stable_baselines.ppo2 import PPO2
from stable_baselines import TD3
from stable_baselines.td3.policies import MlpPolicy
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines import logger
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.bench import Monitor

NUM_TIMESTEPS = int(2e7)
SEED = 721
EVAL_FREQ = 1e6
EVAL_EPISODES = 100
LOGDIR = "tank_td3" # moved to zoo afterwards.

logger.configure(folder=LOGDIR)

env = gym.make("TankGym-v0")
env.seed(SEED)
env.policy = tankgym.BaselineRand()

# The noise objects for TD3
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1)

# eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

model.learn(total_timesteps=NUM_TIMESTEPS, log_interval=10)
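# A typical follow-up (an assumption, not shown in the original snippet): persist
# the trained agent inside LOGDIR so it can later be reloaded with TD3.load().
model.save(LOGDIR + "/final_model")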
Example #11
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=0)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='rnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument(
        '--save_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument(
        '--load_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--save_image', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='tmp')
    parser.add_argument('--logdir', type=str, default='./logs/')
    parser.add_argument('--clip_rewards', type=int, default=1)
    parser.add_argument('--e_greedy', type=int, default=0)
    parser.add_argument('--action_space', type=str, default='RIGHT_ONLY')
    parser.add_argument('--load_mtype', type=str, default='latest')

    args = parser.parse_args()
    logdir = os.path.join(
        args.logdir, args.exp_name + '_' +
        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    logger.configure(folder=logdir,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          load_dir=args.load_dir,
          save_dir=args.save_dir,
          test=args.test,
          exp_name=args.exp_name,
          clip_rewards=args.clip_rewards,
          save_image=args.save_image,
          action_space=args.action_space,
          e_greedy=args.e_greedy,
          load_mtype=args.load_mtype)
Example #12
def train(
    _run,
    _seed: int,
    env_name: str,
    rollout_path: str,
    n_expert_demos: Optional[int],
    log_dir: str,
    *,
    n_epochs: int,
    n_gen_steps_per_epoch: int,
    n_disc_steps_per_epoch: int,
    init_trainer_kwargs: dict,
    n_episodes_eval: int,
    plot_interval: int,
    n_plot_episodes: int,
    show_plots: bool,
    init_tensorboard: bool,
    checkpoint_interval: int = 5,
) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

  Plots (turn on using `plot_interval > 0`):
    - Plot discriminator loss during discriminator training steps in blue and
      discriminator loss during generator training steps in red.
    - Plot the performance of the generator policy versus the performance of
      a random policy. Also plot the performance of an expert policy if that is
      provided in the arguments.

  Checkpoints:
    - DiscrimNets are saved to f"{log_dir}/checkpoints/{step}/discrim/",
      where step is either the training epoch or "final".
    - Generator policies are saved to
      f"{log_dir}/checkpoints/{step}/gen_policy/".

  Args:
    _seed: Random seed.
    env_name: The environment to train in.
    rollout_path: Path to pickle containing list of Trajectories. Used as
      expert demonstrations.
    n_expert_demos: The number of expert trajectories to actually use
      after loading them from `rollout_path`.
      If None, then use all available trajectories.
      If `n_expert_demos` is an `int`, then use exactly `n_expert_demos`
      trajectories, erroring if there aren't enough trajectories. If there are
      surplus trajectories, then use the
      first `n_expert_demos` trajectories and drop the rest.
    log_dir: Directory to save models and other logging to.

    n_epochs: The number of epochs to train. Each epoch consists of
      `n_disc_steps_per_epoch` discriminator steps followed by
      `n_gen_steps_per_epoch` generator steps.
    n_gen_steps_per_epoch: The number of generator update steps during every
      training epoch.
    n_disc_steps_per_epoch: The number of discriminator update steps during
      every training epoch.
    init_trainer_kwargs: Keyword arguments passed to `init_trainer`,
      used to initialize the trainer.
    n_episodes_eval: The number of episodes to average over when calculating
      the average episode reward of the imitation policy for return.

    plot_interval: The number of epochs between each plot. (If nonpositive,
      then plots are disabled).
    n_plot_episodes: The number of episodes averaged over when
      calculating the average episode reward of a policy for the performance
      plots.
    show_plots: Figures are always saved to `f"{log_dir}/plots/*.png"`. If
      `show_plots` is True, then also show plots as they are created.
    init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`.

    checkpoint_interval: Save the discriminator and generator models every
      `checkpoint_interval` epochs and after training is complete. If <=0,
      then only save weights after training is complete.

  Returns:
    A dictionary with two keys. "imit_stats" gives the return value of
      `rollout_stats()` on rollouts in the test-reward-wrapped
      environment, using the final policy (remember that the ground-truth reward
      can be recovered from the "monitor_return" key). "expert_stats" gives the
      return value of `rollout_stats()` on the expert demonstrations loaded from
      `rollout_path`.
  """
    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    # Calculate stats for expert rollouts. Used for plot and return value.
    with open(rollout_path, "rb") as f:
        expert_trajs = pickle.load(f)

    if n_expert_demos is not None:
        assert len(expert_trajs) >= n_expert_demos
        expert_trajs = expert_trajs[:n_expert_demos]

    expert_stats = util.rollout.rollout_stats(expert_trajs)

    with util.make_session():
        sb_logger.configure(folder=osp.join(log_dir, 'generator'),
                            format_strs=['tensorboard', 'stdout'])

        if init_tensorboard:
            sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
            kwargs = init_trainer_kwargs
            kwargs["init_rl_kwargs"] = kwargs.get("init_rl_kwargs", {})
            kwargs["init_rl_kwargs"]["tensorboard_log"] = sb_tensorboard_dir

        trainer = init_trainer(env_name,
                               expert_trajs,
                               seed=_seed,
                               log_dir=log_dir,
                               **init_trainer_kwargs)

        if plot_interval > 0:
            visualizer = _TrainVisualizer(
                trainer=trainer,
                show_plots=show_plots,
                n_episodes_per_reward_data=n_plot_episodes,
                log_dir=log_dir,
                expert_mean_ep_reward=expert_stats["return_mean"])
        else:
            visualizer = None

        # Main training loop.
        for epoch in tqdm.tqdm(range(1, n_epochs + 1), desc="epoch"):
            trainer.train_disc(n_disc_steps_per_epoch)
            if visualizer:
                visualizer.add_data_disc_loss(False)

            trainer.train_gen(n_gen_steps_per_epoch)
            if visualizer:
                visualizer.add_data_disc_loss(True)

            if visualizer and epoch % plot_interval == 0:
                visualizer.plot_disc_loss()
                visualizer.add_data_ep_reward(trainer.venv,
                                              "Ground Truth Reward")
                visualizer.add_data_ep_reward(trainer.venv_train,
                                              "Train Reward")
                visualizer.add_data_ep_reward(trainer.venv_test, "Test Reward")
                visualizer.plot_ep_reward()

            if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
                save(trainer,
                     os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

        # Save final artifacts.
        save(trainer, os.path.join(log_dir, "checkpoints", "final"))

        # Final evaluation of imitation policy.
        results = {}
        sample_until_eval = util.rollout.min_episodes(n_episodes_eval)
        trajs = util.rollout.generate_trajectories(
            trainer.gen_policy,
            trainer.venv_test,
            sample_until=sample_until_eval)
        results["imit_stats"] = util.rollout.rollout_stats(trajs)
        results["expert_stats"] = expert_stats

        return results
Example #13
def main(args):

  logger.configure(config.LOGDIR)

  if args.debug:
    logger.set_level(config.DEBUG)
  else:
    logger.set_level(config.INFO)
    
  #make environment
  env = get_environment(args.env_name)(verbose = args.verbose, manual = args.manual)
  env.seed(args.seed)

  total_rewards = {}

  if args.recommend:
    ppo_model = load_model(env, 'best_model.zip')
    ppo_agent = Agent('best_model', ppo_model)
  else:
    ppo_agent = None


  agents = []

  #load the agents
  if len(args.agents) != env.n_players:
    raise Exception(f'{len(args.agents)} players specified but this is a {env.n_players} player game!')


  for i, agent in enumerate(args.agents):
    if agent == 'human':
      agent_obj = Agent('human')
    elif agent == 'greedy':
      agent_obj = Agent('greedy')
    elif agent == 'rules':
      agent_obj = Agent('rules')
    elif agent == 'base':
      base_model = load_model(env, 'base.zip')
      agent_obj = Agent('base', base_model)   
    else:
      ppo_model = load_model(env, f'{agent}.zip')
      agent_obj = Agent(agent, ppo_model)
    agents.append(agent_obj)
    total_rewards[agent_obj.id] = 0

  if args.env_name == "blobwar":
    human_blobwar = Human()

  #play games
  logger.info(f'\nPlaying {args.games} games...')
  for game in range(args.games):
    players = agents[:]

    if args.randomise_players:
      random.shuffle(players)

    obs = env.reset()
    done = False
    
    for i, p in enumerate(players):
      logger.debug(f'Player {i+1} = {p.name}')

    while not done:

      current_player = players[env.current_player_num]
      env.render()
      logger.debug(f'Current player name: {current_player.name}')

      if args.recommend and current_player.name in ['human', 'rules']:
        # show recommendation from last loaded model
        logger.debug(f'\nRecommendation by {ppo_agent.name}:')
        action = ppo_agent.choose_action(env, choose_best_action = True, mask_invalid_actions = True)

      if current_player.name == 'human':
        if args.env_name == "blobwar":
          move = human_blobwar.compute_next_move(env.core)
          action = env.encode_action(move)
        else:
          action = input('\nPlease choose an action: ')


        try:
          action = int(action)
        except:
          # for MultiDiscrete action input as list TODO
          action = eval(action)
      elif current_player.name == 'rules':
        logger.debug(f'\n{current_player.name} model choices')
        action = current_player.choose_action(env, choose_best_action = False, mask_invalid_actions = True)
      else:
        logger.debug(f'\n{current_player.name} model choices')
        action = current_player.choose_action(env, choose_best_action = args.best, mask_invalid_actions = True)

      obs, reward, done, _ = env.step(action)

      for r, player in zip(reward, players):
        total_rewards[player.id] += r
        player.points += r

      if args.cont:
        input('Press any key to continue')
    
    env.render()

    logger.info(f"Played {game + 1} games: {total_rewards}")

    if args.write_results:
      write_results(players, game, args.games, env.turns_taken)

    for p in players:
      p.points = 0

  env.close()
Example #14
def main():
    args = parser().parse_args()
    cfg = YAML().load(
        open(os.environ["FLIGHTMARE_PATH"] + "/flightlib/configs/vec_env.yaml",
             'r'))
    if not args.train:
        cfg["env"]["num_envs"] = 1
        cfg["env"]["num_threads"] = 1

    if args.render:
        cfg["env"]["render"] = "yes"
    else:
        cfg["env"]["render"] = "no"

    env = wrapper.FlightEnvVec(
        QuadrotorEnv_v1(dump(cfg, Dumper=RoundTripDumper), False))

    # set random seed
    configure_random_seed(args.seed, env=env)

    #
    if args.train:
        # save the configuration and other files
        rsg_root = os.path.dirname(os.path.abspath(__file__))
        log_dir = rsg_root + '/saved'
        saver = U.ConfigurationSaver(log_dir=log_dir)
        model = PPO2(
            tensorboard_log=saver.data_dir,
            policy=MlpPolicy,  # check activation function
            policy_kwargs=dict(net_arch=[dict(pi=[128, 128], vf=[128, 128])],
                               act_fun=tf.nn.relu),
            env=env,
            lam=0.95,
            gamma=0.99,  # lower 0.9 ~ 0.99
            # n_steps=math.floor(cfg['env']['max_time'] / cfg['env']['ctl_dt']),
            n_steps=250,
            ent_coef=0.00,
            learning_rate=3e-4,
            vf_coef=0.5,
            max_grad_norm=0.5,
            nminibatches=1,
            noptepochs=10,
            cliprange=0.2,
            verbose=1,
        )

        # tensorboard
        # Make sure that your chrome browser is already on.
        # TensorboardLauncher(saver.data_dir + '/PPO2_1')

        # PPO run
        # Originally the total timestep is 5 x 10^8
        # 10 zeros for nupdates to be 4000
        # 1000000000 is 2000 iterations and so
        # 2000000000 is 4000 iterations.
        logger.configure(folder=saver.data_dir)
        model.learn(total_timesteps=int(25000000),
                    log_dir=saver.data_dir,
                    logger=logger)
        model.save(saver.data_dir)

    # # Testing mode with a trained weight
    else:
        model = PPO2.load(args.weight)
        test_model(env, model, render=args.render)
Example #15
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    norm_reward=False,
                    seed=0,
                    log_dir='',
                    should_render=True):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param norm_reward: (bool) Whether to normalize rewards or not when using VecNormalize
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        env = SubprocVecEnv(
            [make_env(env_id, i, seed, log_dir) for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        use_subproc = 'renders' not in inspect.getfullargspec(
            class_.__init__).args

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, renders=should_render)
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env,
                              os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir)])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir)])

    # Load saved stats for normalizing input and rewards
    if stats_path is not None:
        print("Loading running average")
        env = VecNormalize(env, training=False, norm_reward=norm_reward)
        env.load_running_average(stats_path)
    return env
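if __name__ == '__main__':
    # Hypothetical smoke test of create_test_env (the Atari env id below is an
    # assumption, not taken from the original source).
    test_env = create_test_env("BreakoutNoFrameskip-v4", n_envs=1, is_atari=True,
                               seed=0, log_dir=None, should_render=False)
    print("observation space:", test_env.observation_space)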
Example #16
def main(args):

    logger.configure(config.LOGDIR)

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        logger.set_level(config.INFO)

    # make environment
    env = get_environment(args.env_name)(verbose=args.verbose,
                                         manual=args.manual)
    env.seed(args.seed)

    total_rewards = {}

    first_time = True

    if args.recommend:
        ppo_model = load_model(env, 'best_model.zip')
        ppo_agent = Agent('best_model', ppo_model)
    else:
        ppo_agent = None

    agents = []

    # load the agents
    if len(args.agents) != env.n_players:
        raise Exception(
            f'{len(args.agents)} players specified but this is a {env.n_players} player game!'
        )

    for i, agent in enumerate(args.agents):
        if agent == 'human':
            agent_obj = Agent('human')
        elif agent == 'rules':
            agent_obj = Agent('rules')
        elif agent == 'json':
            # Start mq server
            context = zmq.Context()
            socket = context.socket(zmq.REP)
            socket.bind("tcp://*:5555")
            logger.debug("zmq server started on port 5555")
            agent_obj = Agent('json')
        elif agent == 'base':
            base_model = load_model(env, 'base.zip')
            agent_obj = Agent('base', base_model)
        else:
            ppo_model = load_model(env, f'{agent}.zip')
            agent_obj = Agent(agent, ppo_model)
        agents.append(agent_obj)
        total_rewards[agent_obj.id] = 0

    # play games
    logger.info(f'\nPlaying {args.games} games...')
    for game in range(args.games):
        players = agents[:]

        if args.randomise_players:
            random.shuffle(players)

        obs = env.reset()
        done = False

        for i, p in enumerate(players):
            logger.debug(f'Player {i+1} = {p.name}')

        while not done:

            current_player = players[env.current_player_num]
            env.render()
            logger.debug(f'\nCurrent player name: {current_player.name}')

            if args.recommend and current_player.name in [
                    'human', 'rules', 'json'
            ]:
                # show recommendation from last loaded model
                logger.debug(f'\nRecommendation by {ppo_agent.name}:')
                action = ppo_agent.choose_action(env,
                                                 choose_best_action=True,
                                                 mask_invalid_actions=True)

            if current_player.name == 'human':
                action = input('\nPlease choose an action: ')
                try:
                    # for int actions
                    action = int(action)
                except:
                    # for MultiDiscrete action input as list TODO
                    action = eval(action)

            if current_player.name == 'json':
                if (not first_time):
                    game_state = {
                        "legal_action":
                        [i for i, o in enumerate(env.legal_actions) if o != 0],
                        "tableCard":
                        env.tableCard.id
                    }

                    socket.send_json(game_state)

                action = socket.recv_json()
                first_time = False
                logger.debug(f'\nReceived {action}')

                #  action = input('\n JSON!!! Please choose an action: ')
                try:
                    # for int actions
                    action = int(action)
                except:
                    # for MultiDiscrete action input as list TODO
                    action = eval(action)
            elif current_player.name == 'rules':
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(
                    env, choose_best_action=False, mask_invalid_actions=True)
            else:
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(
                    env,
                    choose_best_action=args.best,
                    mask_invalid_actions=True)

            obs, reward, done, _ = env.step(action)

            for r, player in zip(reward, players):
                total_rewards[player.id] += r
                player.points += r

            if args.cont:
                input('Press any key to continue')

        env.render()

        logger.info(f"Played {game + 1} games: {total_rewards}")

        if args.write_results:
            write_results(players, game, args.games, env.turns_taken)

        for p in players:
            p.points = 0

    env.close()
Example #17
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str = None,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):

      - Policies are saved to `{log_dir}/policies/{step}.pkl`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      normalize: If True, then rescale observations and reward.
      make_blank_policy_kwargs: Kwargs for `make_blank_policy`.

      reward_type: If provided, then load the serialized reward of this type,
          wrapping the environment in this reward. This is useful to test
          whether a reward model transfers. For more information, see
          `imitation.rewards.serialize.load_reward`.
      reward_path: A specifier, such as a path to a file on disk, used by
          reward_type to load the reward model. For more information, see
          `imitation.rewards.serialize.load_reward`.

      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. Could be more than `rollout_save_n_timesteps` because
          trajectories are saved by episode rather than by transition.
          Must set exactly one of `rollout_save_n_timesteps`
          and `rollout_save_n_episodes`.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.

      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.
  """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name,
                                 num_vec,
                                 seed=_seed,
                                 parallel=parallel,
                                 log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv)

            policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    util.rollout.save(rollout_dir,
                                      policy,
                                      venv,
                                      step,
                                      n_timesteps=rollout_save_n_timesteps,
                                      n_episodes=rollout_save_n_episodes)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)
                return True  # Continue training.

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                util.rollout.save(rollout_dir,
                                  policy,
                                  venv,
                                  "final",
                                  n_timesteps=rollout_save_n_timesteps,
                                  n_episodes=rollout_save_n_episodes)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)
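# Hypothetical invocation of rollouts_and_policy above (every argument value is an
# assumption, not taken from the original configuration): train a small expert and
# save only the final rollouts and policy under log_dir.
rollouts_and_policy(
    _seed=0,
    env_name="CartPole-v1",
    total_timesteps=100000,
    log_dir="output/expert_demos",
    rollout_save_final=True,
    rollout_save_n_episodes=10,
)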
Example #18
def main():
    global model, best_model_path, last_model_path, sim_joy
    mission = 'PushStonesEnv'  # Change according to algorithm
    env = gym.make(mission + '-v0').unwrapped

    # Create log and model dir
    dir = 'stable_bl/' + mission
    # dir = 'stable_bl/PushMultipleStones'
    os.makedirs(dir + '/model_dir/sac', exist_ok=True)

    jobs = ['train', 'record', 'record-w/hm', 'BC_agent', 'play']
    job = jobs[0]

    name = 'PickUp_40_episodes'
    pretrain = False
    fillBuffer = False

    if job == 'train':

        # create new folder
        try:
            tests = os.listdir(dir + '/model_dir/sac')
            indexes = []
            for item in tests:
                indexes.append(int(item.split('_')[1]))
            if not bool(indexes):
                k = 0
            else:
                k = max(indexes) + 1
        except FileNotFoundError:
            os.makedirs(dir + '/log_dir/sac')
            k = 0

        model_dir = os.getcwd() + '/' + dir + '/model_dir/sac/test_{}'.format(
            str(k))

        best_model_path = model_dir
        last_model_path = model_dir

        log_dir = dir + '/log_dir/sac/test_{}'.format(str(k))
        logger.configure(folder=log_dir,
                         format_strs=['stdout', 'log', 'csv', 'tensorboard'])

        num_timesteps = int(1e6)

        policy_kwargs = dict(layers=[64, 64, 64])

        # SAC - start learning from scratch
        model = SAC(CnnPolicy,
                    env,
                    gamma=0.99,
                    learning_rate=1e-4,
                    buffer_size=50000,
                    learning_starts=1000,
                    train_freq=1,
                    batch_size=64,
                    tau=0.01,
                    ent_coef='auto',
                    target_update_interval=1,
                    gradient_steps=1,
                    target_entropy='auto',
                    action_noise=None,
                    random_exploration=0.0,
                    verbose=2,
                    tensorboard_log=log_dir,
                    _init_setup_model=True,
                    full_tensorboard_log=True,
                    seed=None,
                    n_cpu_tf_sess=None)

        # Load best model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_rew = (model for model in models if 'rew' in model)
        # ind, reward = [], []
        # for model in models_rew:
        #     ind.append(model.split('_')[1])
        #     reward.append(model.split('_')[3])
        # best_reward = max(reward)
        # best_model_ind = reward.index(best_reward)
        # k = ind[best_model_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_rew_' + best_reward, env=env,
        #                  custom_objects=dict(learning_starts=0))
        # Load last saved model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_time = (model for model in models if 'rew' not in model)
        # ind, hour, min = [], [], []
        # for model in models_time:
        #     ind.append(model.split('_')[1])
        #     hour.append(model.split('_')[3])
        #     min.append(model.split('_')[4])
        # date = models_time[0].split('_')[2]
        # latest_hour = max(hour)
        # latest_hour_ind = [i for i, n in enumerate(hour) if n == latest_hour]
        # latest_min = max(min[latest_hour_ind])
        # latest_min_ind = min(latest_min)
        # k = ind[latest_min_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_' + date + '_' + latest_hour[0] + '_' + latest_min + 'zip',
        #                  env=env, custom_objects=dict(learning_starts=0))

        # model = SAC.load(dir + '/model_dir/sac/test_0_11_16_2.zip',
        #                  env=env, tensorboard_log=log_dir,
        #                  custom_objects=dict(learning_starts=0)) #, learning_rate=2e-4,
        #                                      # train_freq=8, gradient_steps=4, target_update_interval=4))
        # #                                              # batch_size=32))

        # pretrain
        if pretrain:
            # load dataset only once
            # expert_dataset(name)
            dataset = ExpertDataset(expert_path=(os.getcwd() + '/' + name +
                                                 '_dataset.npz'),
                                    traj_limitation=-1)
            model.pretrain(dataset, n_epochs=2000)

        # fill replay buffer with Benny's recordings
        if fillBuffer:
            traj = expert_dataset(name)

            for i in range(len(traj['obs']) - 1):
                if traj['episode_starts'][i + 1]:
                    done = True
                else:
                    done = False

                obs = traj['obs'][i]
                action = traj['actions'][i]
                reward = traj['rewards'][i]
                next_obs = traj['obs'][i + 1]

                model.replay_buffer.add(obs, action, reward, next_obs,
                                        float(done))

        # Test the pre-trained model
        # env = model.get_env()
        # obs = env.reset()
        #
        # reward_sum = 0.0
        # for _ in range(1000):
        #     action, _ = model.predict(obs)
        #     obs, reward, done, _ = env.step(action)
        #     reward_sum += reward
        #     if done:
        #         print(reward_sum)
        #         reward_sum = 0.0
        #         obs = env.reset()
        #
        # env.close()

        # learn
        model.learn(total_timesteps=num_timesteps, callback=save_fn)

        # PPO1
        # model = PPO1(Common_MlpPolicy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
        #      optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5,
        #      schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
        #      policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)

        # TRPO
        # model = TRPO(MlpPolicy, env, timesteps_per_batch=4096, tensorboard_log=log_dir, verbose=1)
        # model.learn(total_timesteps=500000)
        # model.save(log_dir)

    elif job == 'record':

        mission = 'PushStonesHeatMapEnv'
        env = gym.make(mission + '-v0').unwrapped

        obs = []
        actions = []
        rewards = []
        dones = []
        episode_rewards = []

        num_episodes = 30

        listener = keyboard.Listener(on_press=on_press)
        listener.start()

        episode = 0
        while episode < num_episodes:

            ob = env.reset()
            done = False
            print('Episode number ', episode + 1)
            episode_reward = 0

            while not done:

                act = "recording"
                # act = sim_joy
                # act = [0,1,0.5]
                new_ob, reward, done, info = env.step(act)

                # print(info['action'])
                # print(ob)

                if recorder_on:
                    obs.append(ob)
                    actions.append(info['action'])
                    rewards.append(reward)
                    dones.append(done)
                    episode_reward += reward

                ob = new_ob

            episode_rewards.append(episode_reward)

            # Episodes that ended early (left the arena / hit the step limit)
            # are not saved and do not count towards num_episodes.
            if info['reset reason'] in ('out of boarders', 'limit time steps'):
                continue

            print('saving data')
            data_saver(obs, actions, rewards, dones, episode_rewards)
            episode += 1
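        # `on_press`, `recorder_on` and `data_saver` are defined elsewhere in
        # this script.  A plausible sketch of the keyboard toggle (an
        # assumption, not the original code):
        #
        #   def on_press(key):
        #       global recorder_on
        #       if key == keyboard.Key.space:
        #           recorder_on = not recorder_on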

    elif job == 'play':
        # env = gym.make('PickUpEnv-v0')
        model = SAC.load(dir + '/model_dir/sac/test_0_11_16_2.zip',
                         env=env,
                         custom_objects=dict(learning_starts=0))  ### ADD NUM

        for _ in range(2):

            obs = env.reset()
            done = False
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
    if policy == "dnn":
        if algo == "dqn":
            from stable_baselines.deepq.policies import MlpPolicy
        else:
            from stable_baselines.common.policies import MlpPolicy
        policyFn = MlpPolicy
    elif policy == "bnn":
        if algo == "dqn":
            from dqn_model import BnnPolicy
        else:
            from model import BnnPolicy
        policyFn = BnnPolicy

    log_dir = f"{algo}-{policy}-{tag}"
    logger.configure(folder=log_dir)

    env = gym.make("SlimeVolley-v0")
    env.atari_mode = True
    env.survival_bonus = True
    env.__init__()
    env.seed(seed)

    eval_env = gym.make("SlimeVolley-v0")
    eval_env.atari_mode = True
    eval_env.__init__()
    eval_env.seed(seed)
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=log_dir,
                                 log_path=log_dir,
                                 eval_freq=eval_freq,
Example #20
    def train(
        self,
        seed: int,
        communication_queue: Queue = None,
        current_iteration: int = -1,
        search_suffix: str = "1",
        env_variables: EnvVariables = None,
        random_search: bool = False,
    ):

        self._set_global_seed(seed=seed)

        env_kwargs_to_set = env_variables if env_variables else self.env_kwargs
        self.logger.debug("env_variables: {}".format(env_kwargs_to_set.get_params_string()))

        reward_threshold = get_reward_threshold(env_name=self.env_name)

        best_model_save_path, tensorboard_log_dir = self._preprocess_storage_dirs()

        if current_iteration != -1 and not self.continue_learning:
            best_model_save_path = best_model_save_path + "_" + str(current_iteration)

        self.logger.debug("best_model_save_path: {}".format(best_model_save_path))

        if communication_queue or search_suffix != "1":
            continue_learning_suffix = self.continue_learning_suffix + "_" + search_suffix
        else:
            continue_learning_suffix = self.continue_learning_suffix

        os.environ["OPENAI_LOG_FORMAT"] = "log,csv"
        if self.continue_learning:
            os.environ["OPENAI_LOGDIR"] = best_model_save_path + "_" + continue_learning_suffix
        else:
            os.environ["OPENAI_LOGDIR"] = best_model_save_path
        configure()

        if self.algo_hyperparams:
            self.logger.debug("Overriding file specified hyperparams with {}".format(eval(self.algo_hyperparams)))
            hyperparams = eval(self.algo_hyperparams)
        else:
            hyperparams = load_hyperparams(algo_name=self.algo_name, env_name=self.env_name, model_suffix=self.model_suffix)

        (normalize_kwargs, n_envs, n_timesteps, log_every, hyperparams,) = self._preprocess_hyperparams(
            _hyperparams=hyperparams
        )

        if n_envs > 1 and self.algo_name == "ppo2":
            # On most envs, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv(
                [
                    make_env_parallel(
                        sb_version=self.sb_version,
                        seed=seed,
                        rank=i,
                        env_name=self.env_name,
                        continue_learning=self.continue_learning,
                        log_dir=best_model_save_path,
                        env_kwargs=env_kwargs_to_set,
                        algo_name=self.algo_name,
                        continue_learning_suffix=continue_learning_suffix,
                    )
                    for i in range(n_envs)
                ]
            )
            if len(normalize_kwargs) > 0:
                env = normalize_env(
                    env=env,
                    vectorize=False,
                    orig_log_dir=best_model_save_path,
                    continue_learning=self.continue_learning,
                    sb_version=self.sb_version,
                    normalize_kwargs=normalize_kwargs,
                )
        else:
            env = make_custom_env(
                seed=seed,
                sb_version=self.sb_version,
                env_kwargs=env_kwargs_to_set,
                normalize_kwargs=normalize_kwargs,
                continue_learning=self.continue_learning,
                log_dir=best_model_save_path,
                env_name=self.env_name,
                algo_name=self.algo_name,
                continue_learning_suffix=continue_learning_suffix,
            )

        if self.n_eval_episodes > DEFAULT_N_EVAL_EPISODES:
            analysis_callback = self.build_callback(
                algo_name=self.algo_name,
                continue_learning=self.continue_learning,
                call_every=log_every,
                eval_callback=self.eval_callback,
                _reward_threshold=reward_threshold,
                eval_episodes=self.n_eval_episodes,
                _eval_env=make_custom_env(
                    seed=seed,
                    continue_learning=self.continue_learning,
                    sb_version=self.sb_version,
                    env_kwargs=env_kwargs_to_set,
                    env_name=self.env_name,
                    log_dir=best_model_save_path,
                    algo_name=self.algo_name,
                    normalize_kwargs=normalize_kwargs,
                    evaluate=True,
                    evaluate_during_learning=True,
                    continue_learning_suffix=continue_learning_suffix,
                ),
                original_env=make_custom_env(
                    seed=seed,
                    continue_learning=self.continue_learning,
                    sb_version=self.sb_version,
                    env_kwargs=self.env_kwargs,
                    env_name=self.env_name,
                    log_dir=best_model_save_path,
                    algo_name=self.algo_name,
                    normalize_kwargs=normalize_kwargs,
                    evaluate=True,
                    evaluate_during_learning=True,
                ),
                env_name=self.env_name,
                _best_model_save_path=best_model_save_path,
                num_envs=n_envs,
                total_timesteps=n_timesteps,
                continue_learning_suffix=continue_learning_suffix,
                communication_queue=communication_queue,
                env_eval_callback=self.env_eval_callback,
                save_replay_buffer=self.save_replay_buffer,
                save_model=self.save_model,
                random_search=random_search,
            )
        else:
            analysis_callback = self.build_callback(
                algo_name=self.algo_name,
                continue_learning=self.continue_learning,
                call_every=log_every,
                eval_callback=self.eval_callback,
                _reward_threshold=reward_threshold,
                eval_episodes=self.n_eval_episodes,
                env_name=self.env_name,
                _best_model_save_path=best_model_save_path,
                num_envs=n_envs,
                continue_learning_suffix=continue_learning_suffix,
                save_replay_buffer=self.save_replay_buffer,
                save_model=self.save_model,
                random_search=random_search,
            )

        if self.continue_learning:
            model = self.create_model(
                seed=seed,
                algo_name=self.algo_name,
                env=env,
                tensorboard_log_dir=tensorboard_log_dir,
                hyperparams=hyperparams,
                best_model_save_path=best_model_save_path,
                n_timesteps=n_timesteps,
                continue_learning=True,
                env_name=self.env_name,
                model_to_load=self.model_to_load,
                save_replay_buffer=self.save_replay_buffer,
            )
        else:
            model = self.create_model(
                seed=seed,
                algo_name=self.algo_name,
                env=env,
                tensorboard_log_dir=tensorboard_log_dir,
                hyperparams=hyperparams,
                env_name=self.env_name,
                n_timesteps=n_timesteps,
                model_to_load=self.model_to_load,
                save_replay_buffer=self.save_replay_buffer,
            )

        try:
            callback_list = [analysis_callback]

            # if len(normalize_kwargs) > 0 and not self.continue_learning:
            #     callback_list = [self._build_vec_normalize_callback(save_path=best_model_save_path,
            #                                                         log_every=log_every), analysis_callback]

            if self.show_progress_bar:
                with ProgressBarManager(total_timesteps=n_timesteps, sb_version=self.sb_version) as progress_callback:
                    callback_list.append(progress_callback)
                    if self.continue_learning and self.log_to_tensorboard:
                        model.learn(
                            total_timesteps=n_timesteps,
                            callback=callback_list,
                            tb_log_name=self.tb_log_name + "_" + continue_learning_suffix,
                        )
                    else:
                        model.learn(
                            total_timesteps=n_timesteps, callback=callback_list, tb_log_name=self.tb_log_name,
                        )

            else:
                if self.continue_learning and self.log_to_tensorboard:
                    model.learn(
                        total_timesteps=n_timesteps,
                        callback=callback_list,
                        tb_log_name=self.tb_log_name + "_" + continue_learning_suffix,
                    )
                else:
                    self.logger.debug("Model learn start...")
                    model.learn(
                        total_timesteps=n_timesteps, callback=callback_list, tb_log_name=self.tb_log_name,
                    )
                    self.logger.debug("Model learn end")
        except KeyboardInterrupt:
            pass
        finally:
            if len(normalize_kwargs) > 0 and not self.continue_learning:
                # Important: save the running average, for testing the agent we need that normalization
                model.get_vec_normalize_env().save(os.path.join(best_model_save_path, "vecnormalize.pkl"))

            # Release resources
            env.close()
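As a side note to the `finally` block above: the saved `vecnormalize.pkl` has to be restored at evaluation time. A minimal sketch, assuming stable-baselines 2.x and a hypothetical `eval_env_fn` that recreates the raw environment:

import os

from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

def load_normalized_eval_env(best_model_save_path, eval_env_fn):
    # Wrap a freshly created env and restore the saved running averages
    venv = DummyVecEnv([eval_env_fn])
    venv = VecNormalize.load(os.path.join(best_model_save_path, "vecnormalize.pkl"), venv)
    # Freeze the statistics and report raw rewards during evaluation
    venv.training = False
    venv.norm_reward = False
    return venv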
Example #21
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averaged
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param env_wrapper: (type) A subclass of gym.Wrapper to wrap the original
                        env with
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    if hyperparams is None:
        hyperparams = {}

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safety
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
            for i in range(n_envs)
        ])
    # PyBullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(
            class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env,
                              os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([
                make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)
            ])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv(
            [make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])
            env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
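A minimal usage sketch of the helper above (values are placeholders; `make_env` and the hyperparams dict come from the surrounding utilities):

hyperparams = {'normalize': False, 'frame_stack': 0}  # placeholder values
env = create_test_env('CartPole-v1', n_envs=1, is_atari=False,
                      stats_path=None, seed=0, log_dir='/tmp/test_logs',
                      should_render=False, hyperparams=hyperparams)
obs = env.reset()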
Example #22
    os.makedirs(top_log_dir, exist_ok=True)
    test_num = 1
    for hyperparams in hyperparams_list:
        hyperparam_log_dir = os.path.join(top_log_dir, hyper_file_name(hyperparams))
        os.makedirs(hyperparam_log_dir, exist_ok=True)
        print("Beginning test", test_num, "of", len(hyperparams_list))
        begin_perm_time = datetime.now()
        for i in range(5,10):
            run_dir = os.path.join(hyperparam_log_dir, "run_" + str(i) + "_monitor_dir")
            hyperparamfilename = os.path.join(run_dir, "hyperparams.txt")
            if os.path.exists(hyperparamfilename):
                continue
            os.makedirs(run_dir, exist_ok=True)
            checkpoint_dir = os.path.join(run_dir, "model_checkpoints")
            os.makedirs(checkpoint_dir, exist_ok=True)
            logger.configure(run_dir)

            env = create_env(n_envs=n_envs, env_name=env_name, log_dir=run_dir)
            model = RLAgent('MlpPolicy', env, verbose=0, **hyperparams).learn(total_timesteps=timesteps, callback=callback)
            model.save(os.path.join(run_dir, "final_agent.pkl"))
            del model
            del env
            gc.collect()
            hyperparamfile = open(hyperparamfilename, 'w')
            hyperparamfile.write(str(hyperparams))
            hyperparamfile.write("\nn_envs = {}\n".format(n_envs))
            hyperparamfile.write("RLAgent = {}\n".format(RLAgent))
            hyperparamfile.write("Env = {}\n".format(args.env))
            hyperparamfile.close()
        print("time remaining:", (datetime.now() - begin_perm_time) * (len(hyperparams_list) - test_num))
        test_num += 1
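`hyperparams_list` and `hyper_file_name` are built elsewhere; a plausible sketch of their shape (purely an assumption) is a grid-search helper:

import itertools

def build_hyperparams_list(grid):
    # Cartesian product over a dict of lists -> list of kwargs dicts
    keys = sorted(grid)
    return [dict(zip(keys, values))
            for values in itertools.product(*(grid[k] for k in keys))]

def hyper_file_name(hyperparams):
    # Flatten a kwargs dict into a filesystem-friendly directory name
    return "_".join("{}_{}".format(k, v) for k, v in sorted(hyperparams.items()))

hyperparams_list = build_hyperparams_list({'learning_rate': [3e-4, 1e-4],
                                           'gamma': [0.99, 0.995]})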
Example #23
    lr = args.lr
    kwargs = kwargs_map[args.agent]
    kwargs['learning_rate'] = lr
    # kwargs['max_grad_norm'] = args.max_grad_norm
    # kwargs['kfac_clip'] = args.kfac_clip
    # kwargs['vf_coef'] = args.vf_coef
    # kwargs['ent_coef'] = args.ent_coef
    # kwargs['n_steps'] = args.n_steps

    start_time = datetime.now()

    log_dir = os.path.join("training_logs", run_name)
    checkpoint_dir = os.path.join(log_dir, "model_checkpoints")
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)
    logger.configure(log_dir)

    env = create_env(n_envs=args.n_envs, env_name=args.env, log_dir=log_dir)
    RLAgent = AGENTS_MAP[args.agent]
    hyperparamfilename = os.path.join(log_dir, "hyperparams.txt")
    hyperparamfile = open(hyperparamfilename, 'w')
    hyperparamfile.write(str(kwargs))
    hyperparamfile.write("\nn_envs = {}\n".format(args.n_envs))
    hyperparamfile.write("RLAgent = {}\n".format(RLAgent))
    hyperparamfile.write("Env = {}\n".format(args.env))
    hyperparamfile.close()
    model = RLAgent('MlpPolicy', env, verbose=1, **kwargs).learn(total_timesteps=total_timesteps, callback=callback)
    model.save(os.path.join(log_dir, "final_agent.pkl"))
    # env.save("trained_agents/env_" + run_name)
    print(kwargs)
    #
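`AGENTS_MAP` and `kwargs_map` are defined elsewhere in this script; a plausible sketch of their shape (an assumption, restricted to algorithms that exist in stable-baselines 2.x):

from stable_baselines import A2C, PPO2

AGENTS_MAP = {'a2c': A2C, 'ppo2': PPO2}
kwargs_map = {
    'a2c': {'n_steps': 5, 'ent_coef': 0.01},
    'ppo2': {'n_steps': 128, 'ent_coef': 0.01, 'nminibatches': 4},
}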
Example #24
def do_ppos(ppos_args, result, intermediate_data_dir, origin_param):

    ppos_args.alg = "ppo_subspace"

    logger.log(f"#######TRAIN: {ppos_args}")
    this_run_dir = get_dir_path_for_this_run(ppos_args)
    if os.path.exists(this_run_dir):
        import shutil
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        import shutil
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {"full_param_traj_dir_path": full_param_traj_dir_path}

    logger.configure(log_dir)

    tic = time.time()

    def make_env():
        env_out = gym.make(ppos_args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if ppos_args.normalize:
        env = VecNormalize(env)

    set_global_seeds(ppos_args.seed)
    policy = MlpMultPolicy

    model = PPO2(policy=policy,
                 env=env,
                 n_steps=ppos_args.n_steps,
                 nminibatches=ppos_args.nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2,
                 policy_kwargs={"num_comp": len(result["first_n_pcs"])},
                 pcs=result["first_n_pcs"],
                 origin_theta=origin_param)
    model.tell_run_info(run_info)

    eprews, optimization_path = model.learn(
        total_timesteps=ppos_args.ppos_num_timesteps,
        give_optimization_path=True)

    toc = time.time()
    logger.log(
        f"####################################PPOS took {toc-tic} seconds")

    moving_ave_rewards = get_moving_aves(eprews, 100)

    return eprews, moving_ave_rewards, optimization_path
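`get_moving_aves` is imported from elsewhere; a minimal sketch of a compatible moving-average helper (an assumption about its behaviour):

import numpy as np

def get_moving_aves(values, window):
    # Trailing moving average; shorter prefixes are averaged as-is
    values = np.asarray(values, dtype=np.float64)
    return [values[max(0, i + 1 - window):i + 1].mean()
            for i in range(len(values))]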
Example #25
def create_test_env(env_id, n_envs=1, is_atari=False,
                    stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averaged
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param env_wrapper: (type) A subclass of gym.Wrapper to wrap the original
                        env with
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safety
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper) for i in range(n_envs)])
    # PyBullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
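`make_env` is the usual rl-zoo style factory; a minimal sketch of such a helper (an assumption, not the exact utility used here):

import os

import gym
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank=0, seed=0, log_dir=None, wrapper_class=None):
    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id)
        env.seed(seed + rank)
        if wrapper_class is not None:
            env = wrapper_class(env)
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)),
                          allow_early_resets=True)
        return env
    return _init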
Example #26
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log,
          expert_path, pretrain, pretrain_epochs, mdpo_update_steps,
          num_trajectories, expert_model, exploration_bonus, bonus_coef,
          random_action_len, is_action_features, dir_name, neural, lipschitz,
          args):
    """
    Train an expert (SAC) or an imitation-learning agent (MDAL / GAIL variants) on a MuJoCo environment, for testing purposes
    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/'\
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)

        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
                file.write("Experiment Arguments:")
                for key, val in args.items():
                    print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        #

        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        # Avoid shadowing the built-in ``eval``
        train = (algo == 'Train')
        evaluate = (algo == 'Evaluate')

        if train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy',
                            env_id,
                            verbose=1,
                            buffer_size=1000000,
                            batch_size=256,
                            ent_coef='auto',
                            train_freq=1,
                            tau=0.01,
                            gradient_steps=1,
                            learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif evaluate:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=10,
                                 evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path,
                                    traj_limitation=10,
                                    verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/mdal/",
                                      seed=seed,
                                      buffer_size=1000000,
                                      ent_coef=0.0,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      d_step=10,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural,
                                      lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy',
                                     env,
                                     dataset,
                                     verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" +
                                     env_name + "/mdal_mdpo_on/",
                                     seed=seed,
                                     max_kl=0.01,
                                     cg_iters=10,
                                     cg_damping=0.1,
                                     entcoeff=0.0,
                                     adversary_entcoeff=0.001,
                                     gamma=0.99,
                                     lam=0.95,
                                     vf_iters=5,
                                     vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps,
                                     klcoeff=1.0,
                                     method="multistep-SGD",
                                     tsallis_q=1.0,
                                     t_pi=t_pi,
                                     t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features,
                                     neural=neural)

            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy',
                                  env,
                                  dataset,
                                  verbose=1,
                                  tensorboard_log="./experiments/" + env_name +
                                  "/mdal_trpo/",
                                  seed=seed,
                                  gamma=0.99,
                                  g_step=3,
                                  d_step=5,
                                  sgd_steps=1,
                                  d_stepsize=9e-5,
                                  entcoeff=0.0,
                                  adversary_entcoeff=0.001,
                                  max_kl=t_pi,
                                  t_pi=t_pi,
                                  t_c=t_c,
                                  exploration_bonus=exploration_bonus,
                                  bonus_coef=bonus_coef,
                                  is_action_features=is_action_features,
                                  neural=neural,
                                  lam=0.98,
                                  timesteps_per_batch=2000,
                                  lipschitz=lipschitz)

            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL

                model = GAIL('MlpPolicy',
                             env,
                             dataset,
                             verbose=1,
                             tensorboard_log="./experiments/" + env_name +
                             "/gail/",
                             seed=seed,
                             entcoeff=0.0,
                             adversary_entcoeff=0.001,
                             lipschitz=lipschitz)

            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF

                model = GAIL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/gail_mdpo_off/",
                                      seed=seed,
                                      ent_coef=0.0,
                                      adversary_entcoeff=0.001,
                                      buffer_size=1000000,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
Example #27
import sys

sys.path.append('/home/frcvision1/Final/My_Environments/Carla-0.9.4')
sys.path.append('/home/frcvision1/Final/learning-to-drive-in-a-day-carla-0.9')
from stable_baselines.common.vec_env import DummyVecEnv
from vae.controller import VAEController
from stable_baselines import logger
import os
from ppo_with_vae import PPOWithVAE
from stable_baselines.ppo2.ppo2 import PPO2
from stable_baselines.common.policies import MlpPolicy
import numpy as np

vae = VAEController()
PATH_MODEL_VAE = "vae.json"
logger.configure(folder='/tmp/ppo_carla2/')
PATH_MODEL_PPO2 = "carla_ppo2_with_vae_500_2mil"


def make_carla_env():
    """Import the package for carla Env, this packge calls the __init__ that registers the environment.Did this just to
    be consistent with gym"""
    sys.path.append('/home/frcvision1/Final/My_Environments/Carla_new')
    from env3 import CarlaEnv
    env = CarlaEnv()
    env = DummyVecEnv([lambda: env])
    return env


env = make_carla_env()
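The snippet ends after creating the environment; a hedged sketch of how training could continue with the imports already present above (plain PPO2 shown here, since the exact PPOWithVAE / VAEController API is not reproduced):

# vae.load(PATH_MODEL_VAE)  # assuming VAEController exposes a load() method

model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log='/tmp/ppo_carla2/')
model.learn(total_timesteps=2000000)
model.save(PATH_MODEL_PPO2)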
Example #28
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params=None,
           save_policies=True):
    """
    launch training with mpi

    :param env: (str) environment ID
    :param logdir: (str) the log directory
    :param n_epochs: (int) the number of training epochs
    :param num_cpu: (int) the number of CPUs to run on
    :param seed: (int) the initial random seed
    :param replay_strategy: (str) the type of replay strategy ('future' or 'none')
    :param policy_save_interval: (int) the interval with which policy pickles are saved.
        If set to 0, only the best and latest policy will be pickled.
    :param clip_return: (float): clip returns to be in [-clip_return, clip_return]
    :param override_params: (dict) override any parameter for training
    :param save_policies: (bool) whether or not to save the policies
    """

    if override_params is None:
        override_params = {}
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        tf_util.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(folder=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'),
              'w') as file_handler:
        json.dump(params, file_handler)
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/stable_baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        # 'use_demo_states': True,
        'compute_q': False,
        'time_horizon': params['time_horizon'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        # 'use_demo_states': False,
        'compute_q': True,
        'time_horizon': params['time_horizon'],
    }

    for name in [
            'time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps',
            'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
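A minimal usage sketch of `launch` (parameter values are illustrative only; a goal-based Fetch environment and a single CPU are assumed):

launch(env='FetchReach-v1',
       logdir='/tmp/her_fetchreach',
       n_epochs=50,
       num_cpu=1,
       seed=0,
       replay_strategy='future',
       policy_save_interval=5,
       clip_return=True)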
Example #29
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--critic-l2-reg', type=float, default=1e-2)
    parser.add_argument('--batch-size', type=int, default=64)  # per MPI worker
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    boolean_flag(parser, 'enable-popart', default=False)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--reward-scale', type=float, default=1.)
    parser.add_argument('--clip-norm', type=float, default=None)
    parser.add_argument('--nb-train-steps', type=int,
                        default=50)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-eval-steps', type=int,
                        default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-rollout-steps', type=int,
                        default=100)  # per epoch cycle and MPI worker
    # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    boolean_flag(parser, 'evaluation', default=False)
    args = parser.parse_args()
    dict_args = vars(args)
    return dict_args


if __name__ == '__main__':
    args = parse_args()
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()
    # Run actual script.
    run(**args)
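`boolean_flag` comes from the baselines-style CLI utilities; a minimal sketch of an equivalent helper (an assumption about the exact signature):

def boolean_flag(parser, name, default=False, help=None):
    # Adds paired --name / --no-name switches writing to the same destination
    dest = name.replace('-', '_')
    parser.add_argument('--' + name, action='store_true', default=default,
                        dest=dest, help=help)
    parser.add_argument('--no-' + name, action='store_false', dest=dest)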
Example #30
def main():
    """
    Runs the test
    """
    parser = mujoco_arg_parser()
    parser.add_argument(
        '--model-path',
        default="/cvgl2/u/surajn/workspace/saved_models/sawyerlift_ppo2/model")
    parser.add_argument('--images', default=False)
    args = parser.parse_args()

    logger.configure()
    if not args.play:
        model, env = train(args.env,
                           num_timesteps=args.num_timesteps,
                           seed=args.seed,
                           model_path=args.model_path,
                           images=args.images)

    if args.play:

        def make_env():
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=True,  # make sure we can render to the screen
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control fast enough so that the simulation looks smooth
                ))
            env_out.reward_range = None
            env_out.metadata = None
            env_out.spec = None
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            return env_out

        #env = make_env()
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        policy = MlpPolicy
        #model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
        #         optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=1024,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.98,
                     vf_iters=5,
                     vf_stepsize=1e-3)
        # load() is a classmethod that returns a new model, so re-bind the result
        model = TRPO.load(args.model_path, env=env)
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            env.render()
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]