Example #1
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
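A minimal invocation sketch (not part of the original snippet); it assumes the baselines A2C names that train relies on (CnnPolicy, LstmPolicy, LnLstmPolicy, make_atari_env, VecFrameStack, learn) are imported in the same module, and the environment id and hyperparameters below are placeholders.

if __name__ == '__main__':
    # Hypothetical call: A2C with a CNN policy on Pong, linear LR schedule, 16 parallel envs.
    train('PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', lrschedule='linear', num_env=16)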
Example #2
def main():
    parser, args_default = arg_parser_common()
    args = parser.parse_args()

    import json
    from dotmap import DotMap
    from copy import copy, deepcopy
    keys_exclude = [
        'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval',
        'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write',
        'kl2clip_sharelogstd', 'policy_variance_state_dependent',
        'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps',
        'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype',
        'reward_scale', 'coef_predict_task', 'explore_additive_rate',
        'explore_additive_threshold', 'explore_timesteps', 'debug_halfcheetah',
        'name_project', 'n_opt_epochs', 'coef_entropy', 'log_interval',
        'save_interval', 'save_debug', 'isatari', 'env_full', 'envtype'
    ]
    keys_exclude.extend([
        'logstd', 'lam', 'hidden_sizes', 'num_layers', 'num_sharing_layers',
        'ac_fn', 'lam_decay', 'policy_type'
    ])
    # TODO: These args should only appear in the dir name when they are explicitly specified.
    # TODO: Split args into group_keys and run_keys.

    #  -------------------- prepare args

    args.env_full = args.env
    args.env = args.env_full.split('-v')[0]

    if not args.isatari:
        args.envtype = MUJOCO
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v2'
    else:
        keys_exclude.append('logstd')
        args.envtype = ATARI
        # if 'NoFrameskip' not in args.env:
        #     args.env = f''
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v4'
    tools.warn_(f'Running with settings for {args.envtype} tasks!!!!!')

    assert bool(args.alg) != bool(
        args.cliptype), 'Either alg or cliptype should be specified'
    if args.alg:  # For release
        args.cliptype = alg2cliptype[args.alg]
        keys_exclude.append('cliptype')
        if len(args.keys_group) == 0:
            args.keys_group = ['alg']
        if args.name_group is None:
            args.name_group = ''
    else:  # For debug
        keys_exclude.append('alg')
        if len(args.keys_group) == 0:
            args.keys_group = ['cliptype', 'clipargs']
        if args.name_group is None:
            args.name_group = 'tmp'

    # ------ Set the values of args
    def update_dict(dictmain, dictnew):
        for key_arg in dictnew:
            if key_arg.startswith('__'):
                # A key starting with '__' customizes other values based on the current value of the key it names.
                key_interest = key_arg[2:]  # e.g., '__cliptype' -> 'cliptype'
                value_interest = dictmain[
                    key_interest]  # look up the current value in dictmain, e.g., kl_klrollback_constant_withratio
                if value_interest in dictnew[key_arg].keys():
                    dictmain = update_dict(dictmain,
                                           dictnew[key_arg][value_interest])
            else:
                if isinstance(dictnew[key_arg],
                              dict) and key_arg in dictmain.keys():
                    dictmain[key_arg].update(dictnew[key_arg])
                else:
                    dictmain[key_arg] = copy(dictnew[key_arg])
        return dictmain

    def reform_specific_dict(d):
        dictmain = dict((k, v) for k, v in d.items() if not k.startswith('__'))
        dictspecific = dict((k, v) for k, v in d.items() if k.startswith('__'))
        return update_dict(dictmain, dictspecific)
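    # Illustrative sketch of the '__' convention handled above (hypothetical values,
    # not taken from this project's config):
    #   d = {'cliptype': 'kl2clip', 'lr': 3e-4,
    #        '__cliptype': {'kl2clip': {'lr': 1e-4}}}
    #   reform_specific_dict(d) applies the override selected by d['cliptype'] and
    #   returns {'cliptype': 'kl2clip', 'lr': 1e-4}.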

    # If any of the following args is None, it is filled in from the built-in default values below
    keys_del = []
    args = vars(args)
    keys = list(args.keys())
    for key in keys:
        if args[key] is None:
            del args[key]  # remove unset entries from args
            keys_del.append(key)
    if len(keys_del) > 0:
        print(
            'The following args were not given values on the command line; built-in values will be used.\n',
            ', '.join(keys_del))

    # args__ = update_dict( copy(args_default), args ) # We need to update the basic args, e.g., env, cliptype
    # args__  = reform_specific_dict( args__)
    # The following operations may seem strange. A clearer version may come in the future.
    args__ = update_dict(
        deepcopy(args),
        args_default)  # generate the default value from args_default
    args = update_dict(args__,
                       args)  # user-specified values take the highest priority
    for key in keys_del:  # make sure that keys_del are within args.keys()
        assert key in args.keys(), key
    # print( json.dumps(args, indent=True) )
    # exit()
    # TODO prepare_dir: rename .finish_indicator to a clearer name.
    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', 'results', baselines)
    args = tools_logger.prepare_dirs(args,
                                     key_first='env',
                                     keys_exclude=keys_exclude,
                                     dirs_type=['log'],
                                     root_dir=root_dir)
    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]

    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])

    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if args.envtype == MUJOCO:

        def make_mujoco_env(rank=0):
            def _thunk():
                env = gym.make(args.env_full)
                env.seed(args.seed + rank)
                env = bench.Monitor(env,
                                    os.path.join(args.log_dir, 'monitor',
                                                 str(rank)),
                                    allow_early_resets=True)
                return env

            return _thunk

        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv(
                [make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)

        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv(
                    [make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            env_test = VecNormalize(
                env_test, ret=False,
                update=False)  # no need to normalize returns for the eval env
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(
            make_atari_env(args.env_full, num_env=args.n_envs, seed=args.seed),
            4)
        env_test = None
        #  TODO : debug VecFrame
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(
                make_atari_env(args.env_full,
                               num_env=args.n_eval_epsiodes,
                               seed=args.seed), 4)
            # env_test.reset()
            # env_test.render()
    # ----------- learn
    if args.envtype == MUJOCO:
        lr = args.lr
        # cliprange = args.clipargs.cliprange
    elif args.envtype == ATARI:
        lr = lambda f: f * args.lr
        # cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    # print('action_space',env.action_space)
    ppo2.learn(policy=policy,
               env=env,
               env_eval=env_test,
               n_steps=args.n_steps,
               nminibatches=args.n_minibatches,
               lam=args.lam,
               gamma=0.99,
               n_opt_epochs=args.n_opt_epochs,
               log_interval=args.log_interval,
               ent_coef=args.coef_entropy,
               lr=lr,
               total_timesteps=args.num_timesteps,
               cliptype=args.cliptype,
               save_interval=args.save_interval,
               args=args)

    tools_logger.finish_dir(args.log_dir)
Example #3
def learn(network,
          env,
          seed=None,
          nsteps=20,
          total_timesteps=int(80e6),
          q_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=10,
          lr=7e-4,
          lrschedule='linear',
          rprop_epsilon=1e-5,
          rprop_alpha=0.99,
          gamma=0.99,
          log_interval=100,
          buffer_size=50000,
          replay_ratio=4,
          replay_start=10000,
          c=10.0,
          trust_region=True,
          alpha=0.99,
          delta=1,
          load_path=None,
          **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel) (default: 20)

    nstack:             int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension
                        (last image dimension) (default: 4)

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10),

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon:      float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer for each batch collected from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the buffer has at least this many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm estimates the KL divergence between the old and updated policy and uses it to determine the step size (default: True)

    delta:              float, max KL divergence between the old policy and updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:               keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                    For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)
    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  q_coef=q_coef,
                  gamma=gamma,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule,
                  c=c,
                  trust_region=trust_region,
                  alpha=alpha,
                  delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(
            0, total_timesteps, nbatch
    ):  #nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in these calls; they reuse data from the replay buffer

    return model
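A hedged usage sketch (not part of the original module): it drives learn with the standard baselines helpers make_atari_env and VecFrameStack and the 'cnn' network preset; the environment id and budgets are placeholders.

if __name__ == '__main__':
    from baselines.common.cmd_util import make_atari_env
    from baselines.common.vec_env.vec_frame_stack import VecFrameStack

    # Hypothetical run: 4 parallel Breakout workers, 4-frame stacking, 1M timesteps.
    train_env = VecFrameStack(make_atari_env('BreakoutNoFrameskip-v4', num_env=4, seed=0), 4)
    model = learn(network='cnn', env=train_env, seed=0, nsteps=20,
                  total_timesteps=int(1e6), lrschedule='linear')
    train_env.close()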
Example #4
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
Example #5
# with tf.variable_scope("global"):
#   network = LSTMPolicy([42, 42, 1], 6)

# saver = tf.train.Saver()
# sess = tf.Session()

# saver.restore(sess, "/tmp/pong/train/model.ckpt-1894351")

tf.reset_default_graph()
sess = tf.Session()


class Actor():
    def __init__(self, ob_space, ac_space, n_batch, n_steps):
        self.network = CnnPolicy(sess, ob_space, ac_space, n_batch, n_steps)
        saver = tf.train.Saver()
        saver.restore(sess, "./checkpoints/model.ckpt")

    def act(self, state):
        stuff = self.network.step(state)
        action, value_, _ = stuff[0], stuff[1], stuff[2:]
        return action, value_


env = VecFrameStack(make_atari_env('PongNoFrameskip-v4', 1, 123), 4)
ob_space = env.observation_space
ac_space = env.action_space
actor = Actor(ob_space, ac_space, 32, 1)
with sess:
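    # The dummy observation batch below has shape (32, 84, 84, 4): 32 matches the
    # n_batch passed to Actor, and 84x84x4 matches the 4-frame-stacked Atari
    # observations produced by make_atari_env + VecFrameStack.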
    print(actor.act(np.ones((32, 84, 84, 4))))
Example #6
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from arguments import get_args
from a2c_agent import a2c_agent
from baselines import logger

if __name__ == '__main__':
    args = get_args()
    logger.configure(dir=args.log_dir)
    # create environments
    envs = VecFrameStack(make_atari_env(args.env_name, args.num_processes, args.seed), 4)
    trainer = a2c_agent(envs, args)
    trainer.learn()
    envs.close()
Example #7
from arguments import get_args
from ppo_agent import ppo_agent
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from models import CNN_Net
from baselines import logger
import os

if __name__ == '__main__':
    args = get_args()
    if not os.path.exists('logs/'):
        os.mkdir('logs/')
    log_path = 'logs/' + args.env_name + '/'
    if not os.path.exists(log_path):
        os.mkdir(log_path)
    # write log information
    logger.configure(dir=log_path)
    envs = VecFrameStack(
        make_atari_env(args.env_name, args.num_workers, args.seed), 4)
    network = CNN_Net(envs.action_space.n)
    ppo_trainer = ppo_agent(envs, args, network, 'atari')
    ppo_trainer.learn()
Example #8
def main():
    parser, clipargs_default_all, args_default_all = arg_parser_common()
    args = parser.parse_args()

    import json
    from dotmap import DotMap
    keys_exclude = [
        'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval',
        'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write',
        'kl2clip_sharelogstd', 'policy_variance_state_dependent',
        'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps',
        'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype',
        'reward_scale', 'coef_predict_task', 'explore_additive_rate',
        'explore_additive_threshold', 'explore_timesteps', 'debug_halfcheetah',
        'name_project', 'env_pure', 'n_opt_epochs', 'coef_entropy',
        'log_interval', 'save_interval', 'save_debug', 'is_atari'
    ]
    # 'is_atari'

    #  -------------------- prepare args

    args.env_pure = args.env.split('-v')[0]

    # env_mujocos = 'InvertedPendulum,InvertedDoublePendulum,HalfCheetah,Hopper,Walker2d,Ant,Reacher,Swimmer,Humanoid'
    # env_mujocos = tools.str2list(env_mujocos)
    if not args.is_atari:
        env_type = MUJOCO
        if '-v' not in args.env:
            args.env = f'{args.env}-v2'
    else:
        env_type = ATARI
        if '-v' not in args.env:
            args.env = f'{args.env}-v4'
    tools.warn_(f'Running with settings for {env_type} tasks!!!!!')

    # --- set value of clipargs
    clipargs_default = clipargs_default_all[env_type]

    clipargs = clipargs_default[args.cliptype].copy()
    clipargs.update(args.clipargs)
    args.clipargs = clipargs

    # --- prepare other args
    # If any of the following args is None, it is filled in from the built-in default values below
    args_default = args_default_all[env_type]
    args = DotMap(vars(args))
    print(
        "The following args are None, so they are set from built-in values:"
    )

    for argname in args_default.keys():
        if args[argname] is None:
            if args.env_pure in args_default[argname].keys():
                args[argname] = args_default[argname][args.env_pure]
            else:
                args[argname] = args_default[argname]['_default']
            print(f"{argname}={args[argname]}")
    # print( json.dumps( args.toDict(), indent='\t') )
    # exit()
    # TODO prepare_dir: rename .finish_indicator to a clearer name.
    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', baselines, 'results')
    args = tools_logger.prepare_dirs(args,
                                     key_first='env',
                                     keys_exclude=keys_exclude,
                                     dirs_type=['log'],
                                     root_dir=root_dir)
    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]

    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])

    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if env_type == MUJOCO:

        def make_mujoco_env(rank=0):
            def _thunk():
                env = gym.make(args.env)
                env.seed(args.seed + rank)
                env = bench.Monitor(env,
                                    os.path.join(args.log_dir, 'monitor',
                                                 str(rank)),
                                    allow_early_resets=True)
                return env

            return _thunk

        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv(
                [make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)

        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv(
                    [make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            env_test = VecNormalize(
                env_test, ret=False,
                update=False)  # no need to normalize returns for the eval env
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(
            make_atari_env(args.env, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        #  TODO : debug VecFrame
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(
                make_atari_env(args.env,
                               num_env=args.n_eval_epsiodes,
                               seed=args.seed), 4)
            # env_test.reset()
            # env_test.render()
    # ----------- learn
    if env_type == MUJOCO:
        lr = args.lr
        # cliprange = args.clipargs.cliprange
    elif env_type == ATARI:
        lr = lambda f: f * args.lr
        # cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    args.env_type = env_type
    ppo2.learn(policy=policy,
               env=env,
               env_eval=env_test,
               n_steps=args.n_steps,
               nminibatches=args.n_minibatches,
               lam=args.lam,
               gamma=0.99,
               n_opt_epochs=args.n_opt_epochs,
               log_interval=args.log_interval,
               ent_coef=args.coef_entropy,
               lr=lr,
               total_timesteps=args.num_timesteps,
               cliptype=args.cliptype,
               save_interval=args.save_interval,
               args=args)

    tools_logger.finish_dir(args.log_dir)
Example #9
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env,
                                               frame_stack=True,
                                               scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed),
                                frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(
            game=args.env,
            state=gamestate,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    elif env_type == 'AirHockey':
        from gym_airhockey.configuration import configure_env
        from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

        version_list = [x for x in args.versions if x is not None]
        version = version_list[
            MPI.COMM_WORLD.Get_rank() %
            len(version_list)]  # Each rank gets its own version

        # setup the environment
        env = gym.make(env_id)
        env.seed(args.seed)
        configure_env(env, version=version)

        # wrap the environment
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        env = DummyVecEnv([lambda: env])
        env.render()

    else:
        get_session(
            tf.ConfigProto(allow_soft_placement=True,
                           intra_op_parallelism_threads=1,
                           inter_op_parallelism_threads=1))

        env = make_vec_env(env_id,
                           env_type,
                           args.num_env or 1,
                           seed,
                           reward_scale=args.reward_scale)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
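A hedged usage sketch (hypothetical, not from the original project): build_env expects an argparse-style namespace, so argparse.Namespace stands in for the parsed CLI arguments, and only attributes the function actually reads are filled in.

if __name__ == '__main__':
    import argparse

    # Hypothetical arguments exercising the Atari branch of build_env above.
    args = argparse.Namespace(env='PongNoFrameskip-v4', alg='ppo2', num_env=8,
                              seed=0, gamestate=None, reward_scale=1.0,
                              versions=[])
    env = build_env(args)
    print(env.observation_space, env.action_space)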
Example #10
File: run.py Project: T3p/baselines
def train(env, policy, policy_init, seed, njobs=1, **alg_args):

    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env(seed=0):
            def _thunk():
                env_rllab = Rllab2GymWrapper(env_rllab_class())
                env_rllab.seed(seed)
                return env_rllab

            return _thunk

        parallel_env = SubprocVecEnv(
            [make_env(seed + i * 100) for i in range(njobs)])
        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, get if Atari or not.
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env(seed=0):
                def _thunk():
                    _env = make_atari(env)
                    _env.seed(seed)
                    return wrap_deepmind(_env)

                return _thunk

            parallel_env = VecFrameStack(
                SubprocVecEnv([make_env(seed + i * 100)
                               for i in range(njobs)]), 4)
        else:
            # Not atari, standard env creation
            def make_env(seed=0):
                def _thunk():
                    _env = gym.make(env)
                    _env.seed(seed)
                    return _env

                return _thunk

            parallel_env = SubprocVecEnv(
                [make_env(seed + i * 100) for i in range(njobs)])

    if policy == 'linear':
        hid_size = num_hid_layers = 0
        use_bias = False
    elif policy == 'simple-nn':
        hid_size = [16]
        num_hid_layers = 1
        use_bias = True
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3
        use_bias = True

    if policy_init == 'xavier':
        policy_initializer = tf.contrib.layers.xavier_initializer()
    elif policy_init == 'zeros':
        policy_initializer = U.normc_initializer(0.0)
    elif policy_init == 'small-weights':
        policy_initializer = U.normc_initializer(0.1)
    else:
        raise Exception('Unrecognized policy initializer.')

    if policy == 'linear' or policy == 'nn' or policy == 'simple-nn':

        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             hid_size=hid_size,
                             num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True,
                             use_bias=use_bias,
                             use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':

        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             gaussian_fixed_var=True,
                             use_bias=False,
                             use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    try:
        affinity = len(os.sched_getaffinity(0))
    except:
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)

    gym.logger.setLevel(logging.WARN)

    pois2.learn(parallel_env, make_policy, **alg_args)
Example #11
def build_env(args):

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    print("In the build_env function with alg :: ", alg)
    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            print(
                "make_vec_env arguments env_id {} , env_type {} , nenv {} ,seed {} , gamestate {} reward_scale {}"
                .format(env_id, env_type, nenv, seed, args.gamestate,
                        args.reward_scale))

            #>
            # print("Called environment for mean and std")
            # env = make_vec_env(env_id, env_type, 1, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            # # env = VecFrameStack(env, frame_stack_size) ## No need for frame stacking while calculation of mean and std
            # ob_mean, ob_std = random_agent_ob_mean_std(env)
            # print(" environment complete with mean {} and std {}".format(ob_mean , ob_std))
            # del env
            #>

            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)

            # print("Received env from make_vec_env  type env {} and env ".format(
            # type(env) , env))
            print("ob_space {} and ac_space {} ".format(
                env.observation_space, env.action_space))
            env = VecFrameStack(env, frame_stack_size)

            print("After Frame stacking env would become ")

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        env = make_vec_env(env_id,
                           env_type,
                           args.num_env or 1,
                           seed,
                           reward_scale=args.reward_scale)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env  #, ob_mean, ob_std
Example #12
def main(visualize=False):
    session = tf_util.make_session()
    env_model = EnvNetwork(action_space_size=6,
                           nbatch=num_env * singlestep,
                           K=K,
                           nsteps=singlestep,
                           reuse=False,
                           session=session)
    session.run(tf.global_variables_initializer())
    env_model.restore()

    env = VecFrameStack(make_doom_env(num_env, seed, 'mixed'), 4)
    navi_model = Model(policy=CnnPolicy,
                       ob_space=env.observation_space,
                       ac_space=Discrete(3),
                       nenvs=num_env,
                       nsteps=nsteps,
                       ent_coef=0.01,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       lr=7e-4,
                       alpha=0.99,
                       epsilon=1e-5,
                       total_timesteps=total_timesteps,
                       lrschedule='linear',
                       model_name='navi')
    navi_model.load("O:\\Doom\\baselinemodel\\navigate_flat2.dat")

    fire_model = Model(policy=CnnPolicy,
                       ob_space=env.observation_space,
                       ac_space=Discrete(3),
                       nenvs=num_env,
                       nsteps=nsteps,
                       ent_coef=0.01,
                       vf_coef=0.5,
                       max_grad_norm=0.5,
                       lr=7e-4,
                       alpha=0.99,
                       epsilon=1e-5,
                       total_timesteps=total_timesteps,
                       lrschedule='linear',
                       model_name='fire')

    fire_model.load("O:\\Doom\\baselinemodel\\fire_flat2.dat")
    policy_model = MixedModel(navi_model, fire_model, check_enemy_leave,
                              check_enemy_enter, [0, 1, 4], [0, 1, 5])
    runner = Runner(env, policy_model, nsteps=nsteps, gamma=0.99)

    nh, nw, nc = env.observation_space.shape

    while True:
        total_loss = 0
        for _ in tqdm(range(save_freq)):
            obs1, _, _, mask1, actions1, _ = runner.run()

            obs1 = np.reshape(obs1, [num_env, nsteps, nh, nw, nc])
            obs1 = obs1[:, :, :, :, -1:]

            actions1 = np.reshape(actions1, [num_env, nsteps])
            mask1 = np.reshape(mask1, [num_env, nsteps])

            hidden_states = env_model.initial_state
            for s in range(0, nsteps - K - singlestep, singlestep):
                input_frames = obs1[:,
                                    s:s + singlestep, :, :, :] // norm_factor
                input_frames = np.reshape(input_frames,
                                          [num_env * singlestep, nh, nw])
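                # one-hot encode the integer pixel values into 9 classes via np.eye(9) indexing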
                input_frames = np.eye(9)[input_frames]
                actions, masks, expected_observations = [], [], []
                for t in range(K):
                    expected_observation = obs1[:, s + t + 1:s + singlestep +
                                                t + 1, :, :, :]
                    expected_observation = np.reshape(
                        expected_observation,
                        [num_env * singlestep, nh, nw, 1])
                    expected_observations.append(expected_observation)

                    action = actions1[:, s + t:s + singlestep + t]
                    action = np.reshape(action, [num_env * singlestep])
                    actions.append(action)

                    mask = mask1[:, s + t:s + singlestep + t]
                    mask = np.reshape(mask, [num_env * singlestep])
                    masks.append(mask)

                if s > 0:
                    loss, prediction, hidden_states = env_model.train_and_predict(
                        input_frames, actions, masks, expected_observations,
                        hidden_states)
                    total_loss += loss
                else:
                    # warm up
                    prediction, hidden_states = env_model.predict(
                        input_frames, actions, masks, hidden_states)

                if visualize and s == 3 * singlestep:
                    for batch_idx in range(num_env * singlestep):
                        expected_t = expected_observations[0]
                        if np.sum(expected_t[batch_idx, :, :, :] > 0.0):
                            input_frame = input_frames[batch_idx, :, :, :]
                            cv2.imshow('input', input_frame)
                            for i in range(K):
                                time_t_expectation = expected_observations[i]
                                exp_obs = time_t_expectation[
                                    batch_idx, :, :, :]
                                cv2.imshow('expected for t+{}'.format(i + 1),
                                           exp_obs)
                            for i in range(K):
                                time_t_prediction = prediction[i]
                                cv2.imshow(
                                    'prediction for t+{}'.format(i + 1),
                                    time_t_prediction[batch_idx, :, :, 7])
                            cv2.waitKey(0)

        print("avg_loss = {}".format(total_loss / K / save_freq /
                                     valid_batch_size))
        env_model.save()
Example #13
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'mujoco':
        # todo: copy paste from akhil: create session instead of getting session
        get_session(
            tf.ConfigProto(allow_soft_placement=True,
                           intra_op_parallelism_threads=1,
                           inter_op_parallelism_threads=1))

        # always using dummy environment should allow running saved models without any further changes!
        # env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])

        if args.num_env:
            env = SubprocVecEnv([
                # bind i at definition time so each worker gets a distinct seed
                lambda i=i: make_mujoco_env(env_id, seed + i if seed is not None
                                            else None, args.reward_scale)
                for i in range(args.num_env)
            ])
        else:
            env = DummyVecEnv(
                [lambda: make_mujoco_env(env_id, seed, args.reward_scale)])

        # uncommented on Akhil's advice, as it is no longer necessary because I'm normalizing the data in my environment!
        env = VecNormalize(env)

    elif env_type == 'atari':
        if alg == 'acer':
            env = make_atari_env(env_id, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env,
                                               frame_stack=True,
                                               scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_atari_env(env_id, nenv, seed),
                                frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(
            game=args.env,
            state=gamestate,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    elif env_type == 'classic_control':

        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e

        env = DummyVecEnv([make_env])

    else:
        raise ValueError('Unknown env_type {}'.format(env_type))

    return env
Example #14
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    print(env_id)
    #extract the agc_env_name
    noskip_idx = env_id.find("NoFrameskip")
    env_name = env_id[:noskip_idx].lower()
    print("Env Name for Masking:", env_name)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        print("preconfig")
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        print("post config")
        config.gpu_options.allow_growth = True
        get_session(config=config)
        print("got session")
        env = make_vec_env(env_id,
                           env_type,
                           args.num_env or 1,
                           seed,
                           reward_scale=args.reward_scale)
        print("made env")
    if args.custom_reward != '':
        from baselines.common.vec_env import VecEnv, VecEnvWrapper
        import baselines.common.custom_reward_wrapper as W
        assert isinstance(env, VecEnv) or isinstance(env, VecEnvWrapper)

        custom_reward_kwargs = eval(args.custom_reward_kwargs)

        if args.custom_reward == 'live_long':
            env = W.VecLiveLongReward(env, **custom_reward_kwargs)
        elif args.custom_reward == 'random_tf':
            env = W.VecTFRandomReward(env, **custom_reward_kwargs)
        elif args.custom_reward == 'preference':
            env = W.VecTFPreferenceReward(env, **custom_reward_kwargs)
        elif args.custom_reward == 'rl_irl':
            if args.custom_reward_path == '':
                assert False, 'no path for reward model'
            else:
                if args.custom_reward_lambda == '':
                    assert False, 'no combination parameter lambda'
                else:
                    env = W.VecRLplusIRLAtariReward(env,
                                                    args.custom_reward_path,
                                                    args.custom_reward_lambda)
        elif args.custom_reward == 'pytorch':
            if args.custom_reward_path == '':
                assert False, 'no path for reward model'
            else:
                if env_type == "atari":
                    env = W.VecPyTorchAtariReward(env, args.custom_reward_path,
                                                  env_name)
                elif env_type == "mujoco":
                    env = W.VecPyTorchMujocoReward(env,
                                                   args.custom_reward_path,
                                                   env_name)
        elif args.custom_reward == "mcmc_mean":
            if args.custom_reward_path == '' or args.mcmc_chain_path == '':
                assert False, 'no path for reward model and/or chain_path'
            else:
                env = W.VecMCMCMeanAtariReward(env, args.custom_reward_path,
                                               args.mcmc_chain_path,
                                               args.embedding_dim, env_name)
        elif args.custom_reward == "mcmc_map":
            if args.custom_reward_path == '':
                assert False, 'no path for reward model and/or chain_path'
            else:
                env = W.VecMCMCMAPAtariReward(env, args.custom_reward_path,
                                              args.embedding_dim, env_name)
        else:
            assert False, 'no such wrapper exists'

    if env_type == 'mujoco':
        print("normalized environment")
        env = VecNormalize(env)
    # if env_type == 'atari':
    #     input("Normalizing for ATari game: okay? [Enter]")
    #     #normalize rewards but not observations for atari
    #     env = VecNormalizeRewards(env)

    return env
Example #15
    extra_checkpoint_info = "bc_degredation"  # for finding the checkpoint again


    hist_length = 4


    #env id, env type, num envs, and seed
    env = make_vec_env(env_id, 'atari', 1, seed,
                       wrapper_kwargs={
                           'clip_rewards':False,
                           'episode_life':False,
                       })

    stochastic = True
    env = VecFrameStack(env, 4)
    demonstrator = PPO2Agent(env, env_type, stochastic)

    ##generate demonstrations for use in BC
    demonstrations, learning_returns = generate_demos(env, env_name, demonstrator, args.checkpoint_path, args.num_demos)

    #Run BC on demos
    dataset_size = sum([len(d) for d in demonstrations])
    print("Data set size = ", dataset_size)


    episode_index_counter = 0
    num_data = 0
    action_set = set()
    action_cnt_dict = {}
    data = []
Example #16
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed),
                                frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=args.env,
            state=gamestate,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    elif env_type == 'unity':
        get_session(tf.ConfigProto(allow_soft_placement=True))
        #    get_session(tf.ConfigProto(allow_soft_placement=True,
        #                                intra_op_parallelism_threads=1,
        #                                inter_op_parallelism_threads=1))
        env = make_multi_unity_vec_env(env_id,
                                       env_type,
                                       args.num_env or 1,
                                       seed,
                                       reward_scale=args.reward_scale)
        env = VecNormalize(env)

    else:
        get_session(
            tf.ConfigProto(allow_soft_placement=True,
                           intra_op_parallelism_threads=1,
                           inter_op_parallelism_threads=1))

        env = make_vec_env(env_id,
                           env_type,
                           args.num_env or 1,
                           seed,
                           reward_scale=args.reward_scale)

        if env_type == 'mujoco' or env_type == 'unity':
            env = VecNormalize(env)

    return env
Example #17
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'mujoco':
        get_session(
            tf.ConfigProto(allow_soft_placement=True,
                           intra_op_parallelism_threads=1,
                           inter_op_parallelism_threads=1))

        if args.num_env:
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               reward_scale=args.reward_scale)
        else:
            env = make_vec_env(env_id,
                               env_type,
                               1,
                               seed,
                               reward_scale=args.reward_scale)

        env = VecNormalize(env)

    elif env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env,
                                               frame_stack=True,
                                               scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed),
                                frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(
            game=args.env,
            state=gamestate,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    elif env_type == 'classic_control':

        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e

        env = DummyVecEnv([make_env])

    else:
        raise ValueError('Unknown env_type {}'.format(env_type))

    return env
Example #18
def build_env(args, selector=None):
    global store
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    print(env_type, env_id, nenv, args.num_env)
    if env_type == 'mujoco':
        get_session(
            tf.ConfigProto(allow_soft_placement=True,
                           intra_op_parallelism_threads=1,
                           inter_op_parallelism_threads=1))

        if args.num_env:
            env = SubprocVecEnv([
                # bind i at definition time so each worker gets a distinct seed
                lambda i=i: make_mujoco_env(env_id, seed + i if seed is not None
                                            else None, args.reward_scale)
                for i in range(args.num_env)
            ])
        else:
            env = DummyVecEnv(
                [lambda: make_mujoco_env(env_id, seed, args.reward_scale)])

        env = VecNormalize(env)

    elif env_type == 'atari':
        if alg == 'acer':
            env = make_atari_env(
                env_id, nenv, seed)  #, wrapper_kwargs={'clip_rewards': False})
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env,
                                               frame_stack=True,
                                               scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        elif "Zelda" in env_id:
            sys.path.append(
                "/home/jupyter/Notebooks/Chang/HardRLWithYoutube/nnrunner/a2c_gvgai"
            )
            import nnrunner.a2c_gvgai.env as gvgai_env
            frame_stack_size = 4
            print("run zelda")
            env = VecFrameStack(
                gvgai_env.make_gvgai_env(env_id,
                                         nenv,
                                         seed,
                                         level_selector=selector,
                                         experiment="PE",
                                         dataset="zelda"), frame_stack_size)
            # env.reset()
            # store = env
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_atari_env(env_id, nenv, seed),
                                frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(
            game=args.env,
            state=gamestate,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    elif env_type == 'classic_control':

        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e

        env = DummyVecEnv([make_env])

    else:
        raise ValueError('Unknown env_type {}'.format(env_type))

    # env.reset()
    print("build env")
    # store.reset()
    # store.reset()

    return env