Example #1
def setup_env(env_name, train=True):
    if env_name == "CartPole-v0":
        env = gym.make(env_name)
    else:
        env = make_atari(env_name)
        if train:
            env = wrap_deepmind(env, episode_life=True, clip_rewards=False,
                                frame_stack=True, scale=True)
        else:
            env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                                frame_stack=True, scale=True)

    return env
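A minimal usage sketch for the setup_env helper above, assuming gym and the Baselines-style make_atari/wrap_deepmind are already imported at module level (the environment id is illustrative):

train_env = setup_env("PongNoFrameskip-v4", train=True)   # episodic-life wrapping on
eval_env = setup_env("PongNoFrameskip-v4", train=False)   # full episodes for evaluation
obs = train_env.reset()
print(train_env.observation_space.shape)  # typically (84, 84, 4) with frame_stack=True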
Example #2
    def _thunk():
        # random_seed(seed)
        if env_id.startswith("dm"):
            import dm_control2gym
            _, domain, task = env_id.split('-')
            env = dm_control2gym.make(domain_name=domain, task_name=task)
        else:
            env = gym.make(env_id)
        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)
        # env.seed(seed + rank)
        env = OriginalReturnWrapper(env)
        if is_atari:
            env = wrap_deepmind(env,
                                episode_life=episode_life,
                                clip_rewards=False,
                                frame_stack=False,
                                scale=False)
            obs_shape = env.observation_space.shape
            if len(obs_shape) == 3:
                env = TransposeImage(env)
            env = FrameStack(env, 4)

        return env
Example #3
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4,
                 optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
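A hedged invocation sketch for the PPO1 training function above; it assumes the module already imports mpi4py's MPI plus logger, bench, set_global_seeds, make_atari, wrap_deepmind, PPO1 and CnnPolicy from stable-baselines, and is typically launched under mpirun:

if __name__ == "__main__":
    # e.g. mpirun -np 4 python run_atari_ppo1.py  (script name is illustrative)
    train(env_id="PongNoFrameskip-v4", num_timesteps=int(1e6), seed=0)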
Example #4
 def _thunk():
     env = make_atari(env_id)
     env = gym.wrappers.Monitor(env, '/tmp/video', force=True, video_callable=lambda ep: True)
     env.seed(seed + rank)
     env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                   allow_early_resets=allow_early_resets)
     return wrap_deepmind(env, **wrapper_kwargs)
Example #5
 def _thunk():
     env = make_atari(env_id)
     env.seed(seed + rank)
     env = Monitor(env,
                   logger.get_dir()
                   and os.path.join(logger.get_dir(), str(rank)),
                   allow_early_resets=allow_early_resets)
     return wrap_deepmind(env, **wrapper_kwargs)
Example #6
def make_env():

	env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
	workerseed = MPI.COMM_WORLD.Get_rank()*10000
	env.seed(workerseed)

	env = single_agent_wrapper(env)
	return env
	
Example #7
def wrap_atari_dqn(env):
    """
    Wrap the environment in Atari wrappers for DQN

    :param env: (Gym Environment) the environment
    :return: (Gym Environment) the wrapped environment
    """
    from stable_baselines.common.atari_wrappers import wrap_deepmind
    return wrap_deepmind(env, frame_stack=True, scale=False)
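A short usage sketch for wrap_atari_dqn, assuming make_atari from stable_baselines.common.atari_wrappers and an illustrative environment id:

from stable_baselines.common.atari_wrappers import make_atari

env = make_atari("BreakoutNoFrameskip-v4")  # NoopReset + frame-skip wrappers
env = wrap_atari_dqn(env)                   # DeepMind preprocessing + 4-frame stack
obs = env.reset()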
Example #8
def setup_env(env_name, train=True):
    if env_name in ["CartPole-v0", "SpaceInvaders-ram-v0"]:
        env = gym.make(env_name)
    else:
        env = make_atari(env_name)
        if train:
            env = wrap_deepmind(env,
                                episode_life=True,
                                clip_rewards=False,
                                frame_stack=True,
                                scale=True)
        else:
            env = wrap_deepmind(env,
                                episode_life=False,
                                clip_rewards=False,
                                frame_stack=True,
                                scale=True)

    return env
Example #9
def train_ppo(env_id,
              num_timesteps,
              seed,
              policy,
              save_params,
              n_envs=1,
              nminibatches=5,
              n_steps=8000):
    """
     env_id: typr str, identifies each environment uniquely
     num_timesteps: number of timesteps to run the algorithm
     seed: initial random seed
     policy: policy to be followed (mlp, cnn, lstm, etc)
     n_env: number of envs to run in parallel
     nminibatches: number of minibatches of mini batch gradient descent (first-order optimization) to update the policy params
     n_steps: number of steps in each update
    """
    # Train PPO algorithm for num_timesteps
    # stack the frames for the vectorized environment
    # Note: PPO2 works only with vectorized environment

    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    # create model object for class PPO2
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=n_steps,
                 nminibatches=nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # train the model
    # trained for 2e7 timesteps with seed = 5
    model.learn(total_timesteps=num_timesteps, callback=callback)
    # save the hyperparameters and weights
    model.save(save_params)
    env.close()
    # free the memory
    del model
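A hedged call sketch for train_ppo above. The function body references log_dir and callback as module-level names, so the sketch assumes both are defined alongside it; the argument values mirror the comments in the code:

log_dir = "./logs/ppo2_pong"   # assumed module-level log directory
callback = None                # or a stable-baselines callback, e.g. one that checkpoints the best model

train_ppo(env_id="PongNoFrameskip-v4",
          num_timesteps=int(2e7),
          seed=5,
          policy='cnn',
          save_params="ppo2_pong_params")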
Example #10
 def _thunk():
     env = make_atari(env_id)
     env.seed(seed + rank)
     # env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
     #               allow_early_resets=allow_early_resets)
     if logdir is not None:
         env = Monitor(env, os.path.join(logdir, str(rank)), allow_early_resets=allow_early_resets)
     env = wrap_deepmind(env, **wrapper_kwargs)
     if extra_wrapper_func is not None:
         return extra_wrapper_func(env)
     else:
         return env
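Closures like _thunk above are normally returned from an outer factory and handed to a vectorized environment. A rough sketch of that pattern, where make_env is a hypothetical factory with signature make_env(env_id, seed, rank) that returns _thunk, and SubprocVecEnv comes from baselines:

from baselines.common.vec_env import SubprocVecEnv

# make_env is a hypothetical outer factory that builds and returns _thunk
num_workers = 8
envs = SubprocVecEnv([make_env("PongNoFrameskip-v4", seed=0, rank=i)
                      for i in range(num_workers)])
obs = envs.reset()  # batched observations, one per worker process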
Example #11
def make_env():

	# create pong environment and use wrappers from stable baselines
	env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
	workerseed = MPI.COMM_WORLD.Get_rank()*10000
	env.seed(workerseed)

	# convert standard gym interface to multiagent interface expected by ai arena
	env = single_agent_wrapper(env)
	return env
Example #12
def create_env(args, idx):
    """
    Create and return an environment according to args (parsed arguments).
    idx specifies idx of this environment among parallel environments.
    """
    monitor_file = os.path.join(args.output, ("env_%d" % idx))

    # Check for Atari envs
    if "NoFrameskip" in args.env:
        env = make_atari(args.env)
        env = wrap_deepmind(env, frame_stack=True)
    else:
        env = gym.make(args.env)
    env = Monitor(env, monitor_file)

    return env
Example #13
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the Atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):  # pylint: disable=W0613
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy,
                 env,
                 timesteps_per_batch=512,
                 max_kl=0.001,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.98,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
Example #14
def train_trpo(env_id, num_timesteps, seed, policy='cnn'):

    # env_id: type str, identifies each environment uniquely
    # num_timesteps: number of timesteps to run the algorithm
    # seed: initial random seed
    # policy: key selecting the policy class ('cnn', 'lstm', 'lnlstm' or 'mlp')

    # set up the environment
    rank = MPI.COMM_WORLD.Get_rank()
    sseed = seed + 10000 * rank
    set_global_seeds(sseed)
    env = make_atari(env_id)
    env.seed(sseed)
    env = wrap_deepmind(env)
    env.seed(sseed)
    # define policies
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    # define TRPO class object
    model = TRPO(policy=policy,
                 env=env,
                 timesteps_per_batch=1024,
                 max_kl=0.01,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.99,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4,
                 verbose=1)
    # Train TRPO for num_timesteps
    model.learn(total_timesteps=num_timesteps)
    # save the hyperparameters and weights
    model.save('trpo' + env_id)
    env.close()
    # free the memory
    del model
Example #15
def train_dqn_adv(env_id, train_timesteps, seed, policy, save_params, n_envs=1):
    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[policy]
    # create model object for class DQN
    model = DQN(policy=policy, env=env, gamma=0.99, learning_rate=0.0001, buffer_size=10000,
                exploration_fraction=0.1, exploration_final_eps=0.01, exploration_initial_eps=1.0,
                train_freq=4, batch_size=32, double_q=True, learning_starts=10000,
                target_network_update_freq=1000, prioritized_replay=True, prioritized_replay_alpha=0.6,
                prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
                prioritized_replay_eps=1e-06, param_noise=False, n_cpu_tf_sess=None, verbose=1)
    callback = save_best_model_callback(save_freq=100, log_dir=log_dir, save_params=save_params, verbose=1)
    # train the model
    # trained for 2e7 timesteps with seed = 7
    model.learn(total_timesteps=train_timesteps, callback=callback)
    plot_results([log_dir], train_timesteps, results_plotter.X_TIMESTEPS, "DQNPong_TrainedByAdversary")
    plt.show()
    env.close()
    # free the memory
    del model
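A hedged invocation sketch for train_dqn_adv. The function uses log_dir, save_best_model_callback, plot_results, results_plotter and plt as module-level names, so they are assumed to be defined elsewhere in the file; the argument values mirror the comments in the code:

log_dir = "./logs/dqn_pong_adv"   # assumed module-level log directory

train_dqn_adv(env_id="PongNoFrameskip-v4",
              train_timesteps=int(2e7),
              seed=7,
              policy='cnn',
              save_params="dqn_pong_adv_params")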
Example #16
    def _thunk():
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            env = dm_control2gym.make(domain_name=domain, task_name=task)
        else:
            env = gym.make(env_id)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)

        env.seed(seed + rank)

        if str(env.__class__.__name__).find('TimeLimit') >= 0:
            env = TimeLimitMask(env)

        if log_dir is not None:
            env = bench.Monitor(env,
                                os.path.join(log_dir, str(rank)),
                                allow_early_resets=allow_early_resets)

        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = wrap_deepmind(env)
        elif len(env.observation_space.shape) == 3:
            raise NotImplementedError(
                "CNN models work only for atari,\n"
                "please use a custom wrapper for a custom pixel input env.\n"
                "See wrap_deepmind for an example.")

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env, op=[2, 0, 1])

        return env
Example #17
def setup_wandb(args):
    config = dict(env=args.env, max_frames=args.max_frames)
    wandb.init(project='rlmp',
               notes='Random Agent',
               tags=['Random'],
               config=config)


if __name__ == "__main__":

    args = get_args()
    setup_wandb(args)
    video_path = 'tmp/video/{}'.format(wandb.run.id)

    env = make_atari(args.env)
    env = wrap_deepmind(env)
    env = wrappers.Monitor(env,
                           video_path,
                           video_callable=lambda x: x % 20 == 0)

    # Configure display
    virtual_display = Display(visible=0, size=(320, 240))
    virtual_display.start()

    num_frames = 0
    while num_frames < args.max_frames:

        state = env.reset()
        done = False
        ep_reward = 0
        while not done:
Example #18
            if done:
                break
        trajectories.append(traj)
    env.close()
    return trajectories


if __name__ == "__main__":
    # room_size = 10
    # num_tasks = 2
    # work_per_task = 8
    # env = Room(room_size, num_tasks, work_per_task, max_steps=200)
    # log_dir = "/content/drive/My Drive/Colab Notebooks/imitation_RL"
    log_dir = "."
    env_name = "BreakoutNoFrameskip-v4"
    env = gym.make(env_name)
    env = wrap_deepmind(env, frame_stack=True, clip_rewards=False)
    num_trajectories = 1
    trajectories = get_trajectories_continuous(env,
                                               num_trajectories,
                                               get_human_act,
                                               lowest_reward=30)
    print(
        f"average reward: {np.mean([sum(traj['rew']) for traj in trajectories])}"
    )
    print()

    trajectory_file = os.path.join(log_dir, f"{env_name}_expert.pkl")
    with open(trajectory_file, "wb") as f:
        dill.dump(trajectories, f)
Example #19
 def _thunk():
     env = make_atari(env_id)
     env.seed(seed + rank)
     env = Monitor(env, os.path.join(logdir, '{:03d}.monitor.csv'.format(rank)),
                   allow_early_resets=allow_early_resets)
     return wrap_deepmind(env, **wrapper_kwargs)
Example #20
 def _thunk():
     env = gym.make(env_id, frameskip=config.frameskip)
     env = NoopResetEnv(env, noop_max=30)
     env.seed(seed + rank)
     return wrap_deepmind(env, **wrapper_kwargs)
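For reference across all of the examples above, here is a rough sketch of what the wrap_deepmind call composes, modeled on the openai/baselines atari_wrappers implementation (defaults and details vary between forks, so treat this as an approximation rather than the library source):

from baselines.common.atari_wrappers import (ClipRewardEnv, EpisodicLifeEnv,
                                             FireResetEnv, FrameStack,
                                             ScaledFloatFrame, WarpFrame)

def wrap_deepmind_sketch(env, episode_life=True, clip_rewards=True,
                         frame_stack=False, scale=False):
    # Approximate composition of the DeepMind-style Atari preprocessing.
    if episode_life:
        env = EpisodicLifeEnv(env)       # treat loss of life as end of episode
    if 'FIRE' in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)          # press FIRE on reset where the game needs it
    env = WarpFrame(env)                 # grayscale and resize frames to 84x84
    if scale:
        env = ScaledFloatFrame(env)      # uint8 pixels -> float32 in [0, 1]
    if clip_rewards:
        env = ClipRewardEnv(env)         # clip rewards to their sign {-1, 0, +1}
    if frame_stack:
        env = FrameStack(env, 4)         # stack the last 4 frames
    return env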