def train_trpo(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)

    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(workerseed)

    # timesteps_per_batch=1024
    timesteps_per_batch = 2048

    # trpo_mpi.learn(network='mlp', env=env, total_timesteps=num_timesteps, timesteps_per_batch=timesteps_per_batch,
    #                max_kl=0.01, cg_iters=10, cg_damping=0.1, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3, seed=workerseed,
    #                num_layers=2, num_hidden=32)

    trpo_mpi.learn(network='mlp', env=env, seed=workerseed, total_timesteps=num_timesteps)
    env.close()
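The function above is meant to be invoked once per MPI rank, with each rank deriving its own worker seed. A minimal, hypothetical driver is sketched below; the environment id, timestep budget, and mpiexec launch line are assumptions, not part of the original example.

# Hypothetical entry point: launch with e.g. `mpiexec -n 4 python this_script.py`
# so every MPI rank calls train_trpo and derives its own worker seed from its rank.
if __name__ == '__main__':
    train_trpo(env_id='Hopper-v2', num_timesteps=1_000_000, seed=0)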
Example #2
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.99, lam=0.97, #0.995 as default
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
Example #3
def train(num_timesteps):

    env = GRID(grid_size=36, square_size=4, stochastic=True)
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_space, ac_space):
        return CnnPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space)

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #4
def train(env,
          seed,
          policy_entcoeff,
          num_timesteps,
          num_iters,
          checkpoint_dir,
          gamma,
          task_name=None):

    from baselines.trpo_mpi import trpo_mpi
    # Set up for MPI seed
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)
    trpo_mpi.learn(network=args.network,
                   env=env,
                   total_timesteps=num_timesteps,
                   ent_coef=policy_entcoeff,
                   max_iters=num_iters,
                   ckpt_dir=checkpoint_dir,
                   timesteps_per_batch=args.batchsize,
                   max_kl=args.max_kl,
                   cg_iters=args.cg_iters,
                   cg_damping=args.cg_damping,
                   gamma=gamma,
                   lam=0.97,
                   vf_iters=args.vf_iters,
                   vf_stepsize=args.vf_stepsize,
                   task_name=task_name,
                   num_layers=args.policy_hidden_layer,
                   num_hidden=args.policy_hidden_size)
Example #5
def train(env_id, num_frames, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)


    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json"%rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
        max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
Example #6
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
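A convention shared by nearly every example here is offsetting the base seed by the MPI rank, so parallel workers collect decorrelated rollouts. A small, hypothetical helper capturing that pattern is sketched below; the helper name and the default offset of 10000 simply mirror the `seed + 10000 * MPI.COMM_WORLD.Get_rank()` lines in these snippets.

def mpi_worker_seed(base_seed, offset=10000):
    # Derive a distinct seed per MPI rank, matching the recurring
    # `seed + 10000 * MPI.COMM_WORLD.Get_rank()` expression above.
    from mpi4py import MPI
    return base_seed + offset * MPI.COMM_WORLD.Get_rank()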
Example #7
def train(env, seed, policy_fn, reward_giver, dataset,
          g_step, d_step, policy_entcoeff, num_timesteps,
          checkpoint_dir, pretrained, BC_max_iter, gamma, rnd_iter, dyn_norm, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.rnd_gail.behavior_clone import learn as bc_learn
        pretrained_weight = bc_learn(env, policy_fn, dataset, task_name, max_iters=BC_max_iter, ckpt_dir=checkpoint_dir)


    from baselines.rnd_gail import trpo_mpi
    # Set up for MPI seed
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)
    trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                   pretrained=pretrained, pretrained_weight=pretrained_weight,
                   g_step=g_step, d_step=d_step,
                   entcoeff=policy_entcoeff,
                   max_timesteps=num_timesteps,
                   ckpt_dir=checkpoint_dir,
                   timesteps_per_batch=1024,
                   max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                   gamma=gamma, lam=0.97,
                   vf_iters=5, vf_stepsize=1e-3,
                   task_name=task_name, rnd_iter=rnd_iter, dyn_norm=dyn_norm, mmd=args.reward==2)
Example #8
def train(env_id, num_timesteps, seed, flight_log_dir, ckpt_dir, model_ckpt_path):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    flight_log = FlightLog(flight_log_dir)
    env = gym.make(env_id)
    env.seed(workerseed)
    set_global_seeds(workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5,
        vf_stepsize=1e-3,
            flight_log = flight_log,
            ckpt_dir = ckpt_dir,
            model_ckpt_path = model_ckpt_path
            )
    env.close()
Example #9
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #10
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
Example #11
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(seed)

    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #12
def train(env_id, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    env.seed(workerseed)

    task_name = "ppo." + args.env.split("-")[0] + "." + ("%.2f"%args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    pposgd_simple.learn(env, policy_fn, 
        max_timesteps=args.num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=args.entcoeff,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        sample_stochastic=args.sample_stochastic, task_name=task_name, save_per_iter=args.save_per_iter,
        ckpt_dir=args.checkpoint_dir, load_model_path=args.load_model_path, task=args.task)
    env.close()
Example #13
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    # TODO: Change back to 1e6
    
    memory = Memory(limit=int(1e2), state_shape=env.state_space.shape, action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()


    kwargs.pop('state_shape')
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
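The noise_type string parsed in the example above is a comma-separated list of '<kind>_<stddev>' tokens. The values below are purely illustrative of what that parser accepts; the specific numbers are assumptions.

# 'none'                       -> no exploration noise
# 'normal_0.1'                 -> Gaussian action noise, sigma 0.1
# 'ou_0.2'                     -> Ornstein-Uhlenbeck action noise, sigma 0.2
# 'adaptive-param_0.2'         -> adaptive parameter-space noise
# 'adaptive-param_0.2,ou_0.1'  -> several kinds, since tokens are split on ','
noise_type = 'adaptive-param_0.2,ou_0.1'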
Example #14
def train(env,
          num_timesteps,
          seed,
          ckpt_dir=None,
          render=False,
          ckpt_freq=0,
          restore_dir=None,
          optim_stepsize=3e-4,
          schedule="linear",
          gamma=0.99,
          optim_epochs=10,
          optim_batchsize=64,
          horizon=2048):

    from baselines.common.fc_learning_utils import FlightLog
    from mpi4py import MPI
    from baselines import logger
    from baselines.ppo1.mlp_policy import MlpPolicy
    from baselines.common import set_global_seeds
    from baselines.ppo1 import pposgd_simple
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    if render:
        env.render()
    env.seed(workerseed)
    set_global_seeds(workerseed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=horizon,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=optim_epochs,
                        optim_stepsize=optim_stepsize,
                        optim_batchsize=optim_batchsize,
                        gamma=gamma,
                        lam=0.95,
                        schedule=schedule,
                        flight_log=None,
                        ckpt_dir=ckpt_dir,
                        restore_dir=restore_dir,
                        save_timestep_period=ckpt_freq)
    env.close()
Example #15
    def train(self, env, nb_steps):
        # Configure things.
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)

        # Parse noise_type
        action_noise = None
        param_noise = None
        nb_actions = env.action_space.shape[-1]
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                             desired_action_stddev=float(0.2))

        # Configure components.
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=True)
        actor = Actor(nb_actions, layer_norm=True)

        # Seed everything to make things reproducible.
        seed = self.seed + 1000000 * rank
        logger.info('rank {}: seed={}, logdir={}'.format(
            rank, seed, logger.get_dir()))
        tf.reset_default_graph()
        set_global_seeds(seed)
        env.seed(seed)

        # Disable logging for rank != 0 to avoid noise.
        if rank == 0:
            start_time = time.time()
        #load_state("D:\project\osim-rl-helper\ddpg.pkl")
        training.train(env=env,
                       param_noise=param_noise,
                       restore=True,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       nb_epochs=1,
                       nb_epoch_cycles=1,
                       render_eval=False,
                       reward_scale=1.0,
                       render=False,
                       normalize_returns=False,
                       normalize_observations=True,
                       critic_l2_reg=1e-2,
                       actor_lr=1e-4,
                       critic_lr=1e-3,
                       popart=False,
                       gamma=0.99,
                       clip_norm=None,
                       nb_train_steps=nb_steps,
                       nb_rollout_steps=5,
                       nb_eval_steps=5,
                       batch_size=64)
        #save_state("D:\project\osim-rl-helper\ddpg.pkl")

        if rank == 0:
            logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #16
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):

    # Only rank 0 worker to report results
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    # If evaluation on DDPG is enabled, create new environment
    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))

    # Parse noise type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]  # Example: converts (4,) to 4

    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
Example #17
def run(seed, noise_type, layer_norm, **kwargs):
    """Configure things."""
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0: logger.set_level(logger.DISABLED)
    """Create Simulation envs."""
    # env = PegintoHoles()
    """Create True envs"""
    env = Env_robot_control()
    nb_actions = env.action_dim
    """Parse noise_type"""
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    """Parse noise_type"""
    action_noise = None
    param_noise = None
    """Configure components."""
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_dim,
                    observation_shape=env.state_dim)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    """Seed everything to make things reproducible."""
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    """Disable logging to avoid noise."""
    start_time = time.time()
    """Evaluate the result"""
    Test(env=env,
         param_noise=param_noise,
         action_noise=action_noise,
         actor=actor,
         critic=critic,
         memory=memory,
         **kwargs)
    """Eval the result"""

    logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #18
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    logger.configure(dir='/home/vaisakhs_shaj/Desktop/DeepReinforcementLearning/5_Deep_Deterministic_Policy_Gradients/LOGS/OSIM')
    # Create envs.
    env = ProstheticsEnv(visualize=True)
    env.change_model(model = '2D', difficulty = 0, prosthetic = True, seed=seed)
        #env.seed(seed)
    #env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    
    eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(2e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 2000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #19
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    # Create a new base directory like /tmp/openai-2018-05-21-12-27-22-552435
    log_dir = os.path.join(
        energyplus_logbase_dir(),
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir
    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        return
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        return

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))  #XXX
        logger.configure(log_dir)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    env = make_energyplus_env(env_id, workerseed)

    ac = env.action_space.sample()
    ob = env.reset()
    ac = np.array([-0.8, -0.8, 1.0, 1.0])
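    # The loop below is a hand-coded bang-bang controller, not a learned policy:
    # whenever ob[1] or ob[2] (presumably zone temperatures) leaves the
    # 23.4-23.6 band, the matching action components are nudged in opposite
    # directions, and the environment is stepped (and reset at episode end)
    # for num_timesteps iterations.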
    for iter in range(num_timesteps):

        if ob[1] > 23.6:
            ac[0] -= 0.01
            ac[2] += 0.05
        if ob[1] < 23.4:
            ac[0] += 0.01
            ac[2] -= 0.05

        if ob[2] > 23.6:
            ac[1] -= 0.01
            ac[3] += 0.05
        if ob[2] < 23.4:
            ac[1] += 0.01
            ac[3] -= 0.05

        ob, rew, done, _ = env.step(ac)

        #print(ob)

        if done:
            ob = env.reset()

    env.close()
Example #20
def train(env_id,
          num_timesteps,
          seed,
          learn=trpo_mpi.learn,
          policy_fn_class=MlpPolicy):
    import baselines.common.tf_util as U

    def policy_fn(name, ob_space, ac_space):
        return policy_fn_class(name=name,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               hid_size=32,
                               num_hid_layers=2)

    sess = U.single_threaded_session()
    sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    # Create a new base directory like /tmp/openai-2018-05-21-12-27-22-552435
    log_dir = os.path.join(
        energyplus_logbase_dir(),
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir
    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        return
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        return

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))  # XXX
        logger.configure(log_dir)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    env = make_energyplus_env(env_id, workerseed)

    learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        # timesteps_per_batch=1*1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        timesteps_per_batch=16 * 1024,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3)
    env.close()
Example #21
def run(cfg, seed, noise_type, layer_norm, evaluation, architecture, **kwargs):    
   
    if MPI.COMM_WORLD.Get_rank() == 0:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        logger.configure(dir_path, ['stdout'])
        
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = GRLEnv(cfg)
    gym.logger.setLevel(logging.WARN)
    env = MyMonitor(env, os.path.join(logger.get_dir(), kwargs['output']))

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev, theta = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), dt=0.03,
                                                        sigma=float(stddev) * np.ones(nb_actions), 
                                                        theta=float(theta) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = MyCritic(layer_norm=layer_norm, architecture=architecture)
    actor = MyActor(nb_actions, layer_norm=layer_norm, architecture=architecture)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, param_noise=param_noise, action_noise=action_noise,
                   actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #22
def train(env_id, rank, environment_args, stacked_obs, num_hidden_units,
          max_iters, checkpoint_dir, log_dir, timesteps_per_batch, render,
          seed):

    sess = U.single_threaded_session()
    sess.__enter__()

    if rank == 0:
        logger.configure()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    if environment_args is not None:
        try:
            env.unwrapped.set_environment_config(environment_args)
        except:
            print("Can't set the configuration to the environment!")

    if rank == 0:
        with open(osp.join(checkpoint_dir, "args.txt"), "a") as f:
            f.write("\nEnvironment argument:\n")
            for k, v in env.unwrapped._config.items():
                f.write("{}: {}\n".format(k, v))

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=num_hidden_units,
                         num_hid_layers=2)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)

    # Support the stacked the frames
    env = FrameStack_Mujoco(env, stacked_obs)
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env,
                   policy_fn,
                   checkpoint_dir,
                   log_dir,
                   render=render,
                   timesteps_per_batch=timesteps_per_batch,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_iters=max_iters,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #23
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    """
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    tf.Session(config=tf_config).__enter__()
    """
    U.make_session(num_cpu=1).__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0: logger.set_level(logger.DISABLED)
    #U.make_session(num_cpu=1).__enter__()
    workerseed = seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    if args.env.lower() == "learntorun":
        from learntorun_env import LearnToRunEnv
        env = LearnToRunEnv(difficulty=0)
    else:
        env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), "{}.monitor.json".format(rank)))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=args.timesteps_per_batch,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=args.epochs, optim_stepsize=3e-4, optim_batchsize=args.optim_batchsize,
            gamma=0.99, lam=0.95, schedule=args.schedule,
        )

    """
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=4096,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=512,
            gamma=0.99, lam=0.95, schedule='adapt', desired_kl=0.02,
        )
    """

    """
    # specifically for humanoid
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=512,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=15, optim_stepsize=3e-4, optim_batchsize=4096,
            gamma=0.99, lam=0.95, schedule='adapt', # add adapt
        )
    """
    env.close()
Example #24
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):

    param_noise = None
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    # env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    nb_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=np.ones(nb_actions))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    critic = Critic(layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #25
File: main.py  Project: o7s8r6/gail-tf
def main(args):
    from baselines.ppo1 import mlp_policy
    U.make_session(num_cpu=args.num_cpu).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            reuse=reuse, hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, ret_threshold=args.ret_threshold, traj_limitation=args.traj_limitation)
    pretrained_weight = None
    if args.pretrained:
        # Pretrain with behavior cloning
        from algo import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
            max_iters=args.BC_max_iter,
            ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir)

    from network.adversary import TransitionClassifier
    # discriminator
    discriminator = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
    if args.algo == 'trpo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from algo import trpo_mpi
        if args.task == 'train':
            trpo_mpi.learn(env, policy_fn, discriminator, dataset,
                pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                g_step=args.g_step, d_step=args.d_step,
                timesteps_per_batch=1024, 
                max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                max_timesteps=args.num_timesteps, 
                entcoeff=args.policy_entcoeff, gamma=0.995, lam=0.97, 
                vf_iters=5, vf_stepsize=1e-3,
                ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                save_per_iter=args.save_per_iter, load_model_path=args.load_model_path,
                task_name=task_name)
        elif args.task == 'evaluate':
            trpo_mpi.evaluate(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
                number_trajs=10, stocahstic_policy=args.stocahstic_policy)
        else: raise NotImplementedError
    else: raise NotImplementedError

    env.close()
Example #26
def train_copos(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hid_size=32,
                                   num_hid_layers=2)

    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(workerseed)

    timesteps_per_batch = 1024
    beta = -1
    if beta < 0:
        nr_episodes = num_timesteps // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)

        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    copos_mpi.learn(env,
                    policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01,
                    beta=beta,
                    cg_iters=10,
                    cg_damping=0.1,
                    max_timesteps=num_timesteps,
                    gamma=0.99,
                    lam=0.98,
                    vf_iters=5,
                    vf_stepsize=1e-3)
    env.close()
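When beta is left negative, the COPOS example above derives it from the policy's initial entropy and the number of batches the timestep budget allows. A small illustrative calculation follows; the concrete numbers (1e6 timesteps, the roughly 2.84 nats of a standard 2-D Gaussian) are assumptions, not values taken from the example.

# Illustrative only: plugging assumed numbers into the beta formula above.
num_timesteps = 1_000_000
timesteps_per_batch = 1024
nr_episodes = num_timesteps // timesteps_per_batch   # 976 batches
entropy = 2.84                                        # assumed initial policy entropy (nats)
beta = 2 * entropy / nr_episodes                      # ~0.0058
print("nr_episodes:", nr_episodes, "beta:", beta)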
Example #27
def start_experiment(**args):
    make_env = partial(make_env_all_params, args=args)
    logger.set_level(logger.DEBUG)

    trainer = Trainer(make_env=make_env,
                      num_timesteps=int(1e8),
                      envs_per_process=N_THREADS)  #TODO
    log, tf_sess = get_experiment_environment(**args)
    with log, tf_sess:
        logdir = logger.get_dir()
        print("results will be saved to ", logdir)
        trainer.train()
Example #28
def train(env,
          seed,
          writer,
          policy_fn,
          med_fn,
          dataset,
          g_step,
          m_step,
          e_step,
          inner_iters,
          pi_stepsize,
          med_stepsize,
          num_timesteps,
          save_per_iter,
          checkpoint_dir,
          log_dir,
          pretrained,
          BC_max_iter,
          task_name=None):
    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env,
                                                 policy_fn,
                                                 dataset,
                                                 max_iters=BC_max_iter)
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)

    learner.learn(env,
                  policy_fn,
                  med_fn,
                  dataset,
                  pretrained,
                  pretrained_weight,
                  g_step,
                  m_step,
                  e_step,
                  inner_iters,
                  save_per_iter,
                  checkpoint_dir,
                  log_dir,
                  med_stepsize=med_stepsize,
                  pi_stepsize=pi_stepsize,
                  max_timesteps=num_timesteps,
                  timesteps_per_batch=1024,
                  task_name=task_name,
                  writer=writer)
Example #29
def createEnv(env_id='CartPole-v1', seed=0):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    return make_mujoco_env(env_id, workerseed)
Example #30
def run(cfg, num_timesteps, seed, hid_size, **kwargs):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    logger.configure(dir_path, ['stdout'])

    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = GRLEnv(cfg)
    env.set_test(False)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=hid_size,
                         num_hid_layers=2)

    env = MyMonitor(env,
                    osp.join(logger.get_dir(), kwargs['output']),
                    report='learn')
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    if kwargs['evaluation']:
        trpo_mpi.play(sess,
                      env,
                      policy_fn,
                      timesteps_per_batch=1024,
                      load_file=kwargs['load_file'])
    else:
        trpo_mpi.learn(sess,
                       env,
                       policy_fn,
                       timesteps_per_batch=1024,
                       max_kl=0.01,
                       cg_iters=10,
                       cg_damping=0.1,
                       max_timesteps=num_timesteps,
                       gamma=0.99,
                       lam=0.98,
                       vf_iters=5,
                       vf_stepsize=1e-3,
                       **kwargs)

    env.close()
Example #31
def train(env_id, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return CnnPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space)

    env = bench.Monitor(
        env,
        logger.get_dir()
        and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    env.seed(workerseed)

    task_name = "trpo." + args.env.split("-")[0] + "." + ("%.2f" %
                                                          args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=512,
                   max_kl=0.001,
                   cg_iters=10,
                   cg_damping=1e-3,
                   max_timesteps=args.num_timesteps,
                   gamma=0.98,
                   lam=1.0,
                   vf_iters=3,
                   vf_stepsize=1e-4,
                   entcoeff=args.entcoeff,
                   sample_stochastic=args.sample_stochastic,
                   task_name=task_name,
                   save_per_iter=args.save_per_iter,
                   ckpt_dir=args.checkpoint_dir,
                   load_model_path=args.load_model_path,
                   task=args.task)
    env.close()
Example #32
def evaluate(env_id, environment_args, stacked_obs, num_hidden_units,
             load_model_path, timesteps_per_batch, video_prefix, render,
             record, seed, info_list):

    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    if environment_args is not None:
        try:
            env.unwrapped.set_environment_config(environment_args)
        except:
            print("Can't set the configuration to the environment!")

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=num_hidden_units,
                         num_hid_layers=2)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)

    # Support the stacked the frames
    env = FrameStack_Mujoco(env, stacked_obs)
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.evaluate(
        env,
        policy_fn,
        timesteps_per_batch,
        load_model_path,
        video_prefix,
        record=record,
        render=render,
        info_list=info_list,
        gamma=0.99,
        lam=0.98,
    )
    env.close()
Example #33
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #34
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
            hid_size=32, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #35
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)
    
    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))