Example #1
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.99, lam=0.97,  # gamma default is 0.995
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
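
For context, here is a minimal driver sketch showing how a train() function like the one above is typically wired up, in the spirit of baselines/gail/run_mujoco.py. The environment id, expert path, hidden sizes and other constants below are illustrative assumptions, not values taken from the example:

# Driver sketch (assumed setup): build the policy factory, discriminator and
# expert dataset, then hand everything to train(). Paths and sizes are placeholders.
import gym
from baselines.common import tf_util as U
from baselines.gail import mlp_policy
from baselines.gail.adversary import TransitionClassifier
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset

U.make_session(num_cpu=1).__enter__()  # single-threaded TF session, as in run_mujoco.py

def policy_fn(name, ob_space, ac_space, reuse=False):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                reuse=reuse, hid_size=100, num_hid_layers=2)

env = gym.make('Hopper-v2')
dataset = Mujoco_Dset(expert_path='expert_data.npz', traj_limitation=-1)  # hypothetical path
reward_giver = TransitionClassifier(env, 100, entcoeff=1e-3)

train(env, seed=0, policy_fn=policy_fn, reward_giver=reward_giver, dataset=dataset,
      algo='trpo', g_step=3, d_step=1, policy_entcoeff=0, num_timesteps=int(5e6),
      save_per_iter=100, checkpoint_dir='checkpoint', pretrained=False,
      BC_max_iter=10000, task_name='gail.Hopper')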
Example #2
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
Example #3
def irl(env, trajectories, discount, seed, log_dir, *,
        tf_cfg, policy_cfg=None, gan_cfg=None, train_cfg=None):
    dataset = _make_dset(trajectories)

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        policy_fn = _policy_factory(policy_cfg)

        gan_kwargs = {'hidden_size': 100}
        if gan_cfg is not None:
            gan_kwargs.update(gan_cfg)
        reward_giver = TransitionClassifier(env, **gan_kwargs)

        train_kwargs = {
            'pretrained': False,
            'BC_max_iter': 10000,
            'g_step': 3, # number of steps to train policy in each epoch
            'd_step': 1, # number of steps to train discriminator in each epoch
            'entcoeff': 0,  # entropy coefficient of the policy
            'max_timesteps': 5e6,  # total number of training timesteps
            'timesteps_per_batch': 1024,
            'max_kl': 0.01,
            'cg_iters': 10,
            'cg_damping': 0.1,
            'lam': 0.97,
            'vf_iters': 5,
            'vf_stepsize': 1e-3,
        }
        if train_cfg is not None:
            train_kwargs.update(train_cfg)

        pretrained_weight = None
        bc_max_iter = train_kwargs.pop('BC_max_iter')
        if train_kwargs['pretrained']:
            # Pretrain with behavior cloning
            pretrained_weight = behavior_clone.learn(
                env, policy_fn, dataset, max_iters=bc_max_iter)
        ckpt_dir = osp.join(log_dir, 'checkpoints')

        with tf.Session(config=tf_cfg) as sess:
            trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank=0,
                           pretrained_weight=pretrained_weight,
                           ckpt_dir=ckpt_dir, log_dir=log_dir,
                           gamma=discount, save_per_iter=100,
                           task_name='gail', **train_kwargs)

            policy_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'pi')
            policy_serialised = sess.run(policy_vars)

    return None, policy_serialised
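
The second return value here is just the list of numpy arrays backing the variables in the 'pi' scope. As a rough sketch, assuming policy_fn follows the usual (name, ob_space, ac_space, reuse) signature from the other examples and that the variable collection comes back in the same creation order, the serialized values could be loaded back like this:

# Sketch: restore policy_serialised into a freshly built policy (assumed usage).
import tensorflow as tf

infer_graph = tf.Graph()
with infer_graph.as_default():
    pi = policy_fn('pi', env.observation_space, env.action_space, reuse=False)
    policy_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'pi')
    # Pair each variable with its saved value; ordering matches creation order.
    assign_ops = [var.assign(val) for var, val in zip(policy_vars, policy_serialised)]
    with tf.Session(config=tf_cfg) as sess:
        sess.run(assign_ops)
        # the policy's act() method can now be used for rollouts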
Example #4
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    # Object positions are initialized identically on every env.reset(), so the
    # debug print below is left commented out:
    # logger.log("all positions: \n", env.reset())  # print the object positions
    
    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        # env.seed(workerseed)  # removed: SawyerLift does not implement seed()

        # TRPO hyperparameters adjusted for this environment
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=15000,  # increased from 10000 for the scaled env
                       max_kl=0.001, cg_iters=50, cg_damping=0.1,  # max_kl was 0.01, cg_iters was 10, cg_damping unchanged
                       gamma=0.995, lam=0.97, # originally 0.97
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
Example #5
def train(env,
          seed,
          policy_fn,
          reward_giver,
          dataset,
          algo,
          g_step,
          d_step,
          policy_entcoeff,
          num_timesteps,
          save_per_iter,
          checkpoint_dir,
          pretrained,
          bc_max_iter,
          task_name=None):
    """
    Train GAIL on MuJoCo.

    :param env: (Gym Environment) the environment
    :param seed: (int) the initial random seed
    :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param reward_giver: (TransitionClassifier) the reward predictor from observation and action
    :param dataset: (MujocoDset) the dataset manager
    :param algo: (str) the algorithm type (only 'trpo' is supported)
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param policy_entcoeff: (float) the weight of the entropy loss for the policy
    :param num_timesteps: (int) the number of timesteps to run
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param pretrained: (bool) use a pretrained behavior clone
    :param bc_max_iter: (int) the maximum number of training iterations for the behavior clone
    :param task_name: (str) the name of the task (can be None)
    """

    pretrained_weight = None
    if pretrained and (bc_max_iter > 0):
        # Pretrain with behavior cloning
        pretrained_weight = behavior_clone.learn(env,
                                                 policy_fn,
                                                 dataset,
                                                 max_iters=bc_max_iter)

    if algo == 'trpo':
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env,
                       policy_fn,
                       timesteps_per_batch=1024,
                       max_kl=0.01,
                       cg_iters=10,
                       gamma=0.995,
                       lam=0.97,
                       entcoeff=policy_entcoeff,
                       cg_damping=0.1,
                       vf_stepsize=1e-3,
                       vf_iters=5,
                       max_timesteps=num_timesteps,
                       pretrained_weight=pretrained_weight,
                       reward_giver=reward_giver,
                       expert_dataset=dataset,
                       rank=rank,
                       save_per_iter=save_per_iter,
                       ckpt_dir=checkpoint_dir,
                       g_step=g_step,
                       d_step=d_step,
                       task_name=task_name)
    else:
        raise NotImplementedError
Example #6
def train(env,
          seed,
          policy_fn,
          reward_giver,
          dataset,
          algo,
          g_step,
          d_step,
          policy_entcoeff,
          num_timesteps,
          save_per_iter,
          checkpoint_dir,
          log_dir,
          pretrained,
          BC_max_iter,
          rew_lambda,
          mix_reward=False,
          task_name=None,
          frame_stack=1):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env,
                                                 policy_fn,
                                                 dataset,
                                                 max_iters=BC_max_iter)

    # Set up for MPI seed
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        trpo_mpi.learn(env,
                       policy_fn,
                       reward_giver,
                       dataset,
                       rank,
                       pretrained=pretrained,
                       pretrained_weight=pretrained_weight,
                       g_step=g_step,
                       d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir,
                       log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=env.env.horizon * 5,
                       max_kl=0.01,
                       cg_iters=10,
                       cg_damping=0.1,
                       gamma=0.995,
                       lam=0.97,
                       vf_iters=5,
                       vf_stepsize=1e-4,
                       mix_reward=mix_reward,
                       r_lambda=rew_lambda,
                       task_name=task_name,
                       frame_stack=frame_stack)

    elif algo == 'ppo':
        from baselines.gail import ppo_mpi
        ppo_mpi.learn(
            env,
            policy_fn,
            reward_giver,
            dataset,
            rank,
            pretrained=pretrained,
            pretrained_weight=pretrained_weight,
            g_step=g_step,
            d_step=d_step,
            entcoeff=policy_entcoeff,
            max_timesteps=num_timesteps,
            ckpt_dir=checkpoint_dir,
            log_dir=log_dir,
            save_per_iter=save_per_iter,
            timesteps_per_batch=env.env.horizon,  # previously env.env.horizon * 5
            gamma=0.995,
            lam=0.97,
            clip_param=0.2,
            optim_epochs=50,
            optim_stepsize=1e-4,
            optim_batchsize=100,  # optimization hypers
            mix_reward=mix_reward,
            r_lambda=rew_lambda,
            task_name=task_name,
            frame_stack=frame_stack)
    else:
        raise NotImplementedError
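
Unlike the stock baselines.gail learner, this variant threads mix_reward and rew_lambda through to a modified trpo_mpi/ppo_mpi. The snippet below is only a guess at the blending rule those flags presumably implement, a convex mix of the environment reward and the discriminator reward; it is not code from the modified learner:

# Assumed reward-mixing rule inside the modified rollout loop (illustrative only).
def mixed_reward(env_reward, disc_reward, r_lambda, mix_reward=True):
    if not mix_reward:
        return disc_reward  # plain GAIL: use the discriminator reward alone
    # Convex combination of task reward and discriminator (imitation) reward.
    return r_lambda * env_reward + (1.0 - r_lambda) * disc_reward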
Example #7
def train(env,
          seed,
          policy_fn,
          reward_giver,
          dataset,
          algo,
          g_step,
          d_step,
          policy_entcoeff,
          num_timesteps,
          save_per_iter,
          checkpoint_dir,
          log_dir,
          pretrained,
          BC_max_iter,
          task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env,
                                                 policy_fn,
                                                 dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(
            env,
            policy_fn,
            reward_giver,
            dataset,
            rank,
            pretrained=pretrained,
            pretrained_weight=pretrained_weight,  # None unless behavior cloning ran
            g_step=g_step,  # number of steps to train the policy in each epoch (default 3)
            d_step=d_step,  # number of steps to train the discriminator in each epoch (default 1)
            entcoeff=policy_entcoeff,  # entropy coefficient (default 0)
            max_timesteps=num_timesteps,  # total number of training timesteps (default 5e6)
            ckpt_dir=checkpoint_dir,
            log_dir=log_dir,  # directory for log files (default 'log')
            save_per_iter=save_per_iter,  # save the model every N iterations (default 100)
            timesteps_per_batch=1024,
            max_kl=0.01,
            cg_iters=10,
            cg_damping=0.1,
            gamma=0.995,
            lam=0.97,
            vf_iters=5,
            vf_stepsize=1e-3,
            task_name=task_name)
    else:
        raise NotImplementedError