Example #1
def make_vec_envs(id, num_processes, gamma, return_evn_vector=False, **kwargs):
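    # Give each of the num_processes worker environments its own port, counting up from the base 'port' kwarg.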
    start_port = kwargs['port']
    ports = range(start_port, start_port + num_processes)

    env_vector = []
    for i in range(num_processes):
        kwargs['port'] = ports[i]
        env_vector.append(_make_env_fn(id, i, **kwargs))

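    # Run in subprocesses when there is more than one env; a single env stays in-process via DummyVecEnv.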
    if len(env_vector) > 1:
        envs = SubprocVecEnv(env_vector)
    else:
        envs = DummyVecEnv(env_vector)

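    # Normalize only low-dimensional (non-image) observations; skip return normalization when gamma is not given.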
    if len(envs.observation_space.shape) == 1:
        use_tf = True
        if gamma is None:
            envs = VecNormalize(envs, ret=False, use_tf=use_tf)
        else:
            envs = VecNormalize(envs, gamma=gamma, use_tf=use_tf)

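    # Single-threaded TensorFlow session with soft device placement and on-demand GPU memory growth.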
    import tensorflow as tf
    from baselines.common.tf_util import get_session
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    if return_evn_vector:
        return envs, env_vector
    return envs
Example #2
def make_vec_envs_pytorch(id,
                          num_processes,
                          gamma,
                          device,
                          return_evn_vector=False,
                          **kwargs):
    from a2c_ppo_acktr.envs import VecPyTorch
    start_port = kwargs['port']
    ports = range(start_port, start_port + num_processes)

    env_vector = []
    for i in range(num_processes):
        kwargs['port'] = ports[i]
        env_vector.append(_make_env_fn(id, i, **kwargs))

    if len(env_vector) > 1:
        envs = SubprocVecEnv(env_vector)
    else:
        envs = DummyVecEnv(env_vector)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

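    # Convert the vectorized env's numpy outputs to torch tensors on the target device.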
    envs = VecPyTorch(envs, device)

    if return_evn_vector:
        return envs, env_vector
    return envs
Example #3
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

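    # Atari wrappers differ per algorithm; the default branch stacks 4 frames on a vectorized env.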
    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed), frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    elif env_type == 'unity':
       get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
       env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)
       env = VecNormalize(env)

    else:
       get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))

       env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

       if env_type == 'mujoco' or env_type == 'unity':
           env = VecNormalize(env)

    return env
Example #4
def wrap_env_pytorch(env, gamma, device):
    from a2c_ppo_acktr.envs import VecPyTorch
    envs = DummyVecEnv([env])

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)
    envs = VecPyTorch(envs, device)
    return envs
Example #5
def Test():
    env = DummyVecEnv([EnvFunc(i) for i in range(1)])
    env = VecNormalize(env, ob=True, ret=True)
    act = ppo.learn(
        network="mlp",
        env=env,
        # lr=3e-4,
        nsteps=256,
        nminibatches=8,
        # lam=0.94,
        total_timesteps=0,
        log_interval=100,
        epsilon_start=0.9,
        epsilon_final=0.002,
        epsilon_decay=140,
        # save_interval=500,
        load_path="/home/duoyi/MyGit/simple_baselines/300",
        num_layers=3,
        num_hidden=256,
        value_network="copy"
    )
    obs = env.reset()
    iFrame = 0

    while True:
        action, _, _, _ = act.step(obs)
        obs, reward, done, info = env.step(action)
        iFrame += 1
        if done[0]:
            print("total Frame", iFrame)
            iFrame = 0
        env.render()
Example #6
def run_train_task(vv):

    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = vv['env'](log_scale_limit=0.0, max_path_length=vv['path_length'])
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    n_envs = vv['batch_size'] // vv['path_length']
    env = DummyVecEnv([make_env for i in range(n_envs)])
    env = VecNormalize(env)

    set_global_seeds(vv['seed'])
    policy = MlpPolicy
    model = ppo2.learn(policy=policy,
                       env=env,
                       nsteps=vv['path_length'],
                       nminibatches=25,
                       lam=0.95,
                       gamma=vv['discount'],
                       noptepochs=10,
                       log_interval=1,
                       ent_coef=0.0,
                       lr=3e-4,
                       cliprange=0.2,
                       total_timesteps=vv['total_timesteps'])
Example #7
    def __call__(self, env_maker, seed=None, monitor_file=None):
        """
        :param env_maker: instance of roam_learning.robot_env.EnvMaker
        :param seed: int that is used to generate seeds for vectorized envs
        :param monitor_file: path to a .csv file to log episode rewards, lengths, etc. of the vectorized envs
        :return: instance of either DummyVecEnv, SubprocVecEnv or ShmemVecEnv
        """
        # Create a list of env makers
        if seed is not None:
            assert isinstance(seed, int)
        env_makers = []
        for i in range(self.nenvs):
            env_makers += [deepcopy(env_maker)]
            if seed is not None:
                seed = hash_seed(seed)
                env_makers[i].set_seed(seed + i)

        # Create the vectorized envs
        envs = self.vec_env_wrapper(env_makers)

        # Monitor the envs before normalization
        if monitor_file is not None:
            envs = VecMonitor(envs, filename=monitor_file)
        if self.normalize_obs or self.normalize_ret:
            envs = VecNormalize(envs, ob=self.normalize_obs, ret=self.normalize_ret, use_tf=True)
        return envs
Example #8
    def __init__(self):

        # Create an instance of the network itself, as well as the memory.
        # Here is also a good place to set environmental parameters,
        # as well as training parameters - number of episodes / iterations, etc.

        args.log_dir = args.log_dir + args.env_name + '_' + args.algo
        try:
            os.makedirs(args.log_dir)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)

        envs = [
            make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)
        ]

        if args.num_processes > 1:
            envs = SubprocVecEnv(envs)
        else:
            envs = DummyVecEnv(envs)

        if len(envs.observation_space.shape) == 1:
            envs = VecNormalize(envs)

        self.environment_name = args.env_name
        self.agent = VecEnvAgent(envs, args)
Example #9
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    agent = VecEnvAgent(envs, args)
    agent.train_maml(num_updates)
Example #10
def train(num_timesteps, seed):
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = PointEnv()
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env, ret=False, cliprew=200)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = ppo2.learn(policy=policy,
                       env=env,
                       nsteps=100,
                       nminibatches=25,
                       lam=0.95,
                       gamma=0.99,
                       noptepochs=10,
                       log_interval=1,
                       ent_coef=0.0,
                       lr=3e-4,
                       cliprange=0.2,
                       total_timesteps=num_timesteps)

    return model, env
Example #11
def train():
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.num_cpus,
                            inter_op_parallelism_threads=args.num_cpus)
    tf.Session(config=config).__enter__()

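    # One remote worker environment per CPU; returns are normalized with the training discount factor.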
    env = RemoteVecEnv([create_env] * args.num_cpus)
    env = VecNormalize(env, ret=True, gamma=args.gamma)

    ppo2.learn(policy=policies.MlpPolicy,
               env=env,
               total_timesteps=args.num_timesteps,
               nminibatches=args.num_minibatches,
               nsteps=args.num_steps,
               noptepochs=args.num_epochs,
               lr=args.learning_rate,
               gamma=args.gamma,
               lam=args.lam,
               ent_coef=args.ent_coef,
               vf_coef=args.vf_coef,
               cliprange=args.clip_range,
               log_interval=args.log_interval,
               save_interval=args.save_interval,
               load_path=args.checkpoint_path,
               num_casks=args.num_casks)
Example #12
def train(angle, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    with tf.Session() as sess:

        def make_env():
            return ant_env(angle)
            # env = gym.make('Ant-v1')
            # return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        # env = ant_env(angle)

        set_global_seeds(seed)
        policy = MlpPolicy
        ppo2.learn(policy=policy,
                   env=env,
                   nsteps=2048,
                   nminibatches=32,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=10,
                   log_interval=10,
                   ent_coef=0.0,
                   lr=3e-4,
                   cliprange=0.2,
                   total_timesteps=num_timesteps)
Example #13
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
       config = tf.ConfigProto(allow_soft_placement=True,
                               intra_op_parallelism_threads=1,
                               inter_op_parallelism_threads=1)
       config.gpu_options.allow_growth = True
       get_session(config=config)

       env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

       if env_type == 'mujoco':
           env = VecNormalize(env)

    return env
Example #14
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = ppo2.learn(policy=policy,
                       env=env,
                       n_steps=2048,
                       nminibatches=32,
                       lam=0.95,
                       gamma=0.99,
                       noptepochs=10,
                       log_interval=1,
                       ent_coef=0.0,
                       learning_rate=3e-4,
                       cliprange=0.2,
                       total_timesteps=num_timesteps)

    return model, env
Example #15
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
       config = tf.ConfigProto(allow_soft_placement=True,
                               intra_op_parallelism_threads=1,
                               inter_op_parallelism_threads=1,
                               gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.20))
       config.gpu_options.allow_growth = True
       get_session(config=config)
       
       flatten_dict_observations = alg not in {'her'}
       env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations)
       
       normalize_value = args.normalize_value
       if (env_type == 'mujoco' or env_type=='roboschool') and normalize_value:
           env = VecNormalize(env)

    return env
Example #16
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()
    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env
    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
        ent_coef=0.0,
        lr=3e-4,
        cliprange=0.2,
        total_timesteps=num_timesteps)
Example #17
def train(env_id, num_timesteps, seed, d_targ, load, point):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import LstmMlpPolicy, MlpPolicy
    import gym
    # import roboschool
    import multiprocessing
    import tensorflow as tf
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env

        return _thunk

    set_global_seeds(seed)

    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    nenvs = 32
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    env = VecNormalize(env)

    policy = MlpPolicy

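    # Keep the observed KL divergence near the target: double the learning rate when KL is well below d_targ, halve it when well above.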
    def adaptive_lr(lr, kl, d_targ):
        if kl < (d_targ / 1.5):
            lr *= 2.
        elif kl > (d_targ * 1.5):
            lr *= .5
        return lr

    ppo2.learn(policy=policy,
               env=env,
               nsteps=512,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=15,
               log_interval=1,
               ent_coef=0.00,
               lr=adaptive_lr,
               cliprange=0.2,
               total_timesteps=num_timesteps,
               load=load,
               point=point,
               init_targ=d_targ)
Example #18
def build_env(args, silent_monitor, prio_args=None):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale,
                               prio_args=prio_args,
                               silent_monitor=silent_monitor)
            if prio_args is None:
                env = VecFrameStack(env, frame_stack_size)
            else:
                env = PrioVecFrameStack(env, frame_stack_size)

            # TODO prio vec frame stack

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        num_env = args.n_active_envs if prio_args is None else args.num_env
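        # HER needs dict observations, so flatten them only for the other algorithms.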
        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id,
                           env_type,
                           num_env or 1,
                           seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations,
                           prio_args=prio_args,
                           silent_monitor=silent_monitor)

        if env_type == 'mujoco':
            if prio_args is None:
                env = VecNormalize(env)
            else:
                env = PrioVecNormalize(env)

    return env
Example #19
def Eval():

    def EnvFunc(iSeed):
        def InnerFunc():
            oEnv = Env()
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
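        # Schedule factory: the returned callable scales initial_value by the progress value it receives (baselines-style schedules pass a remaining-progress fraction).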
        def func(process):
            return process * initial_value
        return func

    learning_rate = linear_schedule(5e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = int(0)
    hyperparams = {'nsteps': 256, 'noptepochs': 8, 'nminibatches': 4, 'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.01}

    num_env = 1
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecNormalize(env, ob=True, ret=False)
    env = VecMonitor(env)

    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        load_path="baselineLog/ppobaseliens-2019-06-05-17-38-15-168854/checkpoints/00300",
        **hyperparams,
        value_network="copy"
    )


    obs = env.reset()
    print("obs", obs.shape)
    bDone = False
    iFrame = 0
    iReward = 0
    reward_list = deque(maxlen=100)
    while not bDone:
        action = act.step(obs)[0]
        obs, reward, done, _ = env.step(action)
        iReward += reward[0]
        # time.sleep(0.01)
        # print("reward", reward)
        iFrame += 1
        # env.render()
        if done[0]:
            obs = env.reset()
            reward_list.append(iReward)
            print("done.................", iFrame, iReward, sum(reward_list) / len(reward_list))

            iFrame = 0
            iReward = 0
Example #20
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    print(env_id)
    #extract the agc_env_name
    noskip_idx = env_id.find("NoFrameskip")
    env_name = env_id[:noskip_idx].lower()
    print("Env Name for Masking:", env_name)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
       config = tf.ConfigProto(allow_soft_placement=True,
                               intra_op_parallelism_threads=1,
                               inter_op_parallelism_threads=1)
       config.gpu_options.allow_growth = True
       get_session(config=config)

       env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

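    # Optionally replace the environment reward with a learned reward model loaded from custom_reward_path.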
    if args.custom_reward != '':
        from baselines.common.vec_env import VecEnv, VecEnvWrapper
        import baselines.common.custom_reward_wrapper as W
        assert isinstance(env,VecEnv) or isinstance(env,VecEnvWrapper)

        custom_reward_kwargs = eval(args.custom_reward_kwargs)

        if args.custom_reward == 'pytorch':
            if args.custom_reward_path == '':
                assert False, 'no path for reward model'
            else:
                env = W.VecPyTorchAtariReward(env, args.custom_reward_path, env_name)
        else:
            assert False, 'no such wrapper exist'

    if env_type == 'mujoco':
        env = VecNormalize(env)
    # if env_type == 'atari':
    #     input("Normalizing for ATari game: okay? [Enter]")
    #     #normalize rewards but not observations for atari
    #     env = VecNormalizeRewards(env)

    return env
Example #21
    def make(seed):
        def make_env():
            env = gym.make(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        return env
Example #22
def build_env(args, seed):
    nenv = 1
    alg = args.alg
    # seed = args.seed
    seed = int(np.random.rand(1) * 101000)
    print(seed)

    env_type, env_id = get_env_type(args.env)
    set_global_seeds(seed)
    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        # config = tf.ConfigProto(allow_soft_placement=True,
        #                        intra_op_parallelism_threads=1,
        #                        inter_op_parallelism_threads=1)
        # config.gpu_options.allow_growth = True
        # get_session(config=config)
        sess = tf.InteractiveSession()
        # env = VecNormalize(make_vec_env(env_id, env_type, 1, seed, reward_scale=args.reward_scale))

        env = make_vec_env(env_id,
                           env_type,
                           args.numenv,
                           seed,
                           reward_scale=args.reward_scale)
        evalenv = make_vec_env(env_id,
                               env_type,
                               args.numenv,
                               seed,
                               reward_scale=args.reward_scale)

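        # For MuJoCo, normalize the training env and share its running statistics with the evaluation env.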
        if env_type == 'mujoco':
            env = VecNormalize(env)
            evalenv = VecNormalizeEval(evalenv)
            evalenv.ob_rms = env.ob_rms
            evalenv.ret_rms = env.ret_rms

    return env, sess, evalenv
Example #23
def train(env_id, num_timesteps, seed, nsteps, batch_size, epoch, method,
          net_size, i_trial, load_path, use_entr, ncpu):
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0:
    #     logger.set_level(logger.DISABLED)

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True

    # workerseed = seed + 10000 * rank
    tf.reset_default_graph()
    set_global_seeds(seed)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 'train-{}.monitor.json'.format(rank)))
            return env

        return _thunk

    # def make_env():
    #     env = gym.make(env_id)
    #     env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
    #     return env

    env = SubprocVecEnv([make_env(i) for i in range(ncpu)])
    # env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    with tf.Session(config=config) as sess:
        policy = MlpPolicy
        ppo2.learn(policy=policy,
                   env=env,
                   nsteps=nsteps,
                   nminibatches=batch_size,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=epoch,
                   log_interval=1,
                   ent_coef=0.01,
                   lr=3e-4,
                   cliprange=0.2,
                   total_timesteps=num_timesteps,
                   useentr=use_entr,
                   net_size=net_size,
                   i_trial=i_trial,
                   load_path=load_path,
                   method=method)
Example #24
def train(env_id, num_timesteps, seed, pol, cur, vis, model):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import HierPolicy, HierPolicy2, MlpPolicy, RandomWalkPolicy
    import gym
    import gym_program
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    hier = True if pol == 'hier1' or pol == 'hier2' else False

    def make_env():
        set_global_seeds(seed)
        env = gym.make(env_id)
        env.set_curiosity(cur, model)
        env.set_hier(hier)
        env.set_visualize(vis)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)

    if pol == 'hier1': policy = HierPolicy
    elif pol == 'hier2': policy = HierPolicy2
    elif pol == 'mlp': policy = MlpPolicy
    elif pol == 'random_walk':
        pol = RandomWalkPolicy
        pol(env)
        return

    ppo2.learn(policy=policy,
               env=env,
               pol=pol,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=1e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)
Example #25
def get_env(env_name,
            no_normalize=False,
            out_dir="results",
            vector=8,
            reward_wrapper=lambda env: env):
    trained_agent = utils.get_trained_agent(env_name)

    ### ENV SETUP ###
    # TODO: upgrade Gym so this monkey-patch isn't needed
    gym.spaces.Dict = type(None)

    def make_env(id):
        # TODO: seed (not currently supported)
        # TODO: VecNormalize? (typically good for MuJoCo)
        # TODO: baselines logger?
        # TODO: we're loading identical policy weights into different
        # variables; this works around the design choice of Agents
        # having their state stored inside of them.
        sess = utils.make_session()
        with sess.as_default():
            multi_env, policy_type = utils.get_env_and_policy_type(env_name)
            multi_env = ShapeWeightHack(multi_env)
            single_env = MultiToSingle(
                DelayedLoadEnv(multi_env, trained_agent, policy_type,
                               "zoo_{}_policy_{}".format(env_name,
                                                         id), 0, sess))
            if env_name == 'kick-and-defend':
                #attacked_agent = utils.load_agent(trained_agent, policy_type,
                #                                  "zoo_{}_policy_{}".format(env_name, id), multi_env, 0)
                #single_env = MultiToSingle(CurryEnv(multi_env, attacked_agent))

                single_env = HackyFixForGoalie(single_env)

            single_env = reward_wrapper(single_env)

            single_env = Gymify(single_env)
            single_env.spec = gym.envs.registration.EnvSpec('Dummy-v0')

            # TODO: upgrade Gym so we don't have to do this
            single_env.observation_space.dtype = np.dtype(np.float32)

            single_env = Monitor(single_env,
                                 osp.join(out_dir, 'mon', 'log{}'.format(id)))
        return single_env
        # TODO: close session?

    venv = SubprocVecEnv(
        [functools.partial(make_env, i) for i in range(vector)])

    if not no_normalize:
        venv = VecNormalize(venv)

    return venv
Example #26
    def create_environment(self):
        envs = [
            make_env(i, args, True, self.gan_file)
            for i in range(self.num_processes)
        ]
        envs = DummyVecEnv(envs)
        if len(envs.observation_space.shape) == 1:
            envs = VecNormalize(envs, gamma=args.gamma)

        obs_shape = envs.observation_space.shape
        obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
        return envs, obs_shape
Example #27
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy

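    # The rollout batch (num_envs * batch_size steps) is split into minibatches of training_batch_size.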
    nbatch = env.num_envs * hyper_parameters['batch_size']
    training_batch_number = nbatch // hyper_parameters['training_batch_size']

    # import pdb; pdb.set_trace()

    # use Adam as the optimizer and give the value function the same settings as the policy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=hyper_parameters['batch_size'],
               lam=hyper_parameters['gae_lambda'],
               gamma=hyper_parameters['discount'],
               ent_coef=hyper_parameters['policy_ent_coeff'],
               nminibatches=training_batch_number,
               noptepochs=hyper_parameters['training_epochs'],
               max_grad_norm=None,
               lr=hyper_parameters['learning_rate'],
               cliprange=hyper_parameters['lr_clip_range'],
               total_timesteps=hyper_parameters['batch_size'] * hyper_parameters['n_epochs'])  # yapf: disable  # noqa: E501

    return osp.join(log_dir, 'progress.csv')
Example #28
def ppo():
    def make_env():
        env = SawyerEnvWrapper(DownEnv(for_her=False))
        return env

    tf.Session().__enter__()
    env = VecNormalize(DummyVecEnv([make_env]))
    policy = MlpPolicy
    model = ppo2.learn(policy=policy, env=env, nsteps=4000, nminibatches=1,
                       lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
                       ent_coef=0.0, lr=3e-4, cliprange=0.2, total_timesteps=1e8)

    return model
Example #29
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

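    # Build the environment by id: custom gridworld / navigation envs, otherwise a standard gym env.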
    def make_env():
        if env_id == 'toy':
            #env = continuous_gridworld.ContinuousGridworld('', max_steps=1000,
            #                                           obstacle_mode=continuous_gridworld.NO_OBJECTS)
            from toy_environment import room_obstacle_list
            env = gridworld.Gridworld(
                obstacle_list_generator=room_obstacle_list.obstacle_list)
        elif env_id == 'navigate':
            env = NavigateEnv(use_camera=False,
                              continuous_actions=True,
                              neg_reward=True,
                              max_steps=500)
        elif env_id == 'arm2pos':
            #env = Arm2PosEnv(continuous=False, max_steps=500)
            pass
        else:
            env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)
Example #30
def train(env_id, num_timesteps, seed, lrschedule, num_env):
    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy_fn = MlpPolicy

    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()