Example #1
def train(env_id, num_timesteps, seed):
    from baselines.ppo_pnp import mlp_policy, pposgd_simple, interactive_ppo, ppo_gail
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=3)

    env = JacoEnv(64, 64, 1, 1.0)  #make_mujoco_env(env_id, seed)
    dataset = Mujoco_Dset(expert_path='data/pnp_demo.npz', traj_limitation=-1)
    reward_giver = TransitionClassifier(env, 100, entcoeff=1e-3)
    ppo_gail.learn(
        env,
        policy_fn,
        reward_giver,
        dataset,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
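
A minimal invocation sketch for the snippet above (the argument values are illustrative assumptions; note that env_id and seed are unused in the body as written, since the environment is constructed directly as JacoEnv(64, 64, 1, 1.0)):

if __name__ == '__main__':
    # Hypothetical arguments; only num_timesteps is actually consumed by the
    # function body shown above.
    train(env_id='Jaco-v0', num_timesteps=int(1e6), seed=0)
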
Example #2
def main(args):
    """
    start training the model

    :param args: (ArgumentParser) the training argument
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name,
                      ob_space,
                      ac_space,
                      reuse=False,
                      placeholders=None,
                      sess=None):
            return mlp_policy.MlpPolicy(name=name,
                                        ob_space=ob_space,
                                        ac_space=ac_space,
                                        reuse=reuse,
                                        sess=sess,
                                        hid_size=args.policy_hidden_size,
                                        num_hid_layers=2,
                                        placeholders=placeholders)

        env = bench.Monitor(
            env,
            logger.get_dir()
            and os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)

        if args.task == 'train':
            dataset = MujocoDset(expert_path=args.expert_path,
                                 traj_limitation=args.traj_limitation)
            reward_giver = TransitionClassifier(
                env,
                args.adversary_hidden_size,
                entcoeff=args.adversary_entcoeff)
            train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
                  args.g_step, args.d_step, args.policy_entcoeff,
                  args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
                  args.pretrained, args.bc_max_iter, task_name)
        elif args.task == 'evaluate':
            runner(env,
                   policy_fn,
                   args.load_model_path,
                   timesteps_per_batch=1024,
                   number_trajs=10,
                   stochastic_policy=args.stochastic_policy,
                   save=args.save_sample)
        else:
            raise NotImplementedError
        env.close()
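
A sketch of an argument parser that would supply the attributes this main reads; the flag names mirror the args.* accesses above, while every default value is an assumption for illustration, not taken from the snippet:

import argparse

def build_argparser():
    # Hypothetical defaults; only the flag names come from the args.*
    # attributes the function above actually reads.
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--task', type=str, choices=['train', 'evaluate'], default='train')
    parser.add_argument('--algo', type=str, default='trpo')
    parser.add_argument('--expert_path', type=str, default='data/expert.npz')
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--policy_entcoeff', type=float, default=0.0)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--g_step', type=int, default=3)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--num_timesteps', type=int, default=int(5e6))
    parser.add_argument('--save_per_iter', type=int, default=100)
    parser.add_argument('--checkpoint_dir', type=str, default='checkpoint')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--bc_max_iter', type=int, default=10000)
    parser.add_argument('--load_model_path', type=str, default=None)
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser

if __name__ == '__main__':
    main(build_argparser().parse_args())
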
Example #3
def main(args):
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    # env = bench.Monitor(env, logger.get_dir() and
    #                     osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    if args.log_dir != Log_dir:
        log_dir = osp.join(Log_dir, args.log_dir)
        save_dir = osp.join(Checkpoint_dir, args.log_dir)
    else:
        log_dir = Log_dir
        save_dir = Checkpoint_dir

    logger.configure(dir=log_dir,
                     log_suffix=task_name,
                     format_strs=["log", "stdout"])

    if args.task == 'train':

        log_dir, data_dir, policy_model_dir, __, _ = get_dirs(args)
        print("log_dir: ", log_dir)
        print("model_dir: ", policy_model_dir)
        #exp_data = get_exp_data2(osp.join(osp.dirname(osp.realpath(__file__)), "../../data/mujoco/%s.pkl" % args.env_id))

        data_path = data_dir + '/expert_sample'
        # eric version
        exp_data = get_exp_data(data_path, args.num_trajs)
        dataset = Dataset(exp_data)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.num_iters, args.save_per_iter, save_dir,
              args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #4
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    #env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.CNNPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    #env = bench.Monitor(env, logger.get_dir() and
    #                    osp.join(logger.get_dir(), "monitor.json"))

    env = make_vec_env(args.env_id, 'atari', 1, args.seed,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })
    env = VecFrameStack(env, 4)

    #env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = LMDB_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example #5
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    env = DelayRewardWrapper(env, args.delay_freq, args.max_path_length)
    eval_env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2, gaussian_fixed_var=args.gaussian_fixed_var)
    env.seed(args.seed)
    eval_env.seed(args.seed)

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              eval_env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              args.num_epochs,
              args.evaluation_freq,
              args.timesteps_per_batch,
              task_name,
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=args.timesteps_per_batch,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example #6
def irl(env, trajectories, discount, seed, log_dir, *,
        tf_cfg, policy_cfg=None, gan_cfg=None, train_cfg=None):
    dataset = _make_dset(trajectories)

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        policy_fn = _policy_factory(policy_cfg)

        gan_kwargs = {'hidden_size': 100}
        if gan_cfg is not None:
            gan_kwargs.update(gan_cfg)
        reward_giver = TransitionClassifier(env, **gan_kwargs)

        train_kwargs = {
            'pretrained': False,
            'BC_max_iter': 10000,
            'g_step': 3, # number of steps to train policy in each epoch
            'd_step': 1, # number of steps to train discriminator in each epoch
            'entcoeff': 0, # entropy coefficient of the policy
            'max_timesteps': 5e6, # total number of training timesteps
            'timesteps_per_batch': 1024,
            'max_kl': 0.01,
            'cg_iters': 10,
            'cg_damping': 0.1,
            'lam': 0.97,
            'vf_iters': 5,
            'vf_stepsize': 1e-3,
        }
        if train_cfg is not None:
            train_kwargs.update(train_cfg)

        pretrained_weight = None
        bc_max_iter = train_kwargs.pop('BC_max_iter')
        if train_kwargs['pretrained']:
            # Pretrain with behavior cloning
            pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                     max_iters=bc_max_iter)
        ckpt_dir = osp.join(log_dir, 'checkpoints')

        with tf.Session(config=tf_cfg) as sess:
            trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank=0,
                           pretrained_weight=pretrained_weight,
                           ckpt_dir=ckpt_dir, log_dir=log_dir,
                           gamma=discount, save_per_iter=100,
                           task_name='gail', **train_kwargs)

            policy_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'pi')
            policy_serialised = sess.run(policy_vars)

    return None, policy_serialised
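
A hedged usage sketch for irl; env and trajectories are assumed to come from the caller's own pipeline, and the session config and hyperparameter overrides are placeholders that only mirror the keyword structure of the signature above:

import tensorflow as tf

# Example: force a CPU-only TF1 session config.
tf_cfg = tf.ConfigProto(device_count={'GPU': 0})

# `env` and `trajectories` are assumed to be provided by the caller.
_, policy_params = irl(env, trajectories, discount=0.99, seed=0,
                       log_dir='logs/gail_irl',
                       tf_cfg=tf_cfg,
                       gan_cfg={'hidden_size': 128},
                       train_cfg={'max_timesteps': 1e6, 'g_step': 1})
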
Example #7
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = AtariDataset(data_path=args.expert_path, game='pinball', max_nb_transitions=5) #TODO: change max_nb_transitions
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=256,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample,
               play=args.play,
               )
    else:
        raise NotImplementedError
    env.close()
Example #8
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    if args.task == 'train':
        env = bench.Monitor(
            env,
            logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_name(args)

        os.makedirs(args.log_dir, exist_ok=True)
        with open(osp.join(args.log_dir, 'args.txt'), 'w') as f:
            f.write(str(args))

        args.checkpoint_dir = osp.join(args.log_dir, 'chckpts')
        os.makedirs(args.checkpoint_dir, exist_ok=True)

        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #9
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(args.env_id,
            ignore_done=True,
            use_camera_obs=False,
            has_renderer=True,
            control_freq=100,
            gripper_visualization=True,
            reward_shaping=True,
            #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
            #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
            #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
            #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
            #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
            #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
            #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
            #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
            #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
            #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
            box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
            box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
            ) # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env) # wrap in the gym environment

    # Environment joints should be clipped at 1 and -1 for sawyer

    
    # Task
    #task = 'train'
    task = 'evaluate'
    # parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')

    # Expert Path
    #expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/ac100/combined/combined_0.npz' # path for 100 trajectories
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/120_shift7/combined/combined_0.npz' # path for the 120_shift7 demonstrations

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
    
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    
    #env.seed(args.seed) # Sawyer does not have seed 

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    #if not os.path.isdir(args.log_dir):
    #    os.makedirs(args.log_dir)

    logger.log("log_directories: ",args.log_dir)
    
    logger.log("environment action space range: ", env.action_space) #logging the action space

    if task == 'train':
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)

        # Check dimensions of the dataset
        #print("dimension of inputs", dataset.dset.inputs.shape) # dims seem correct
        #print("dimension of inputs", dataset.dset.labels.shape) # dims seem correct

        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif task == 'evaluate':
        # Create the playback environment
        play_env = robosuite.make(args.env_id,
                ignore_done=True,
                use_camera_obs=False,
                has_renderer=True,
                control_freq=100,
                gripper_visualization=True,
                #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
                #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
                #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
                #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
                #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
                #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
                #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
                #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
                #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
                #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
                box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
                box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
                )

        #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

        runner(env,
                play_env,
                policy_fn,
                args.load_model_path,
                timesteps_per_batch=4000, # Change time step per batch to be more reasonable
                number_trajs=20, # number of evaluation trajectories
                stochastic_policy=args.stochastic_policy,
                save=args.save_sample
                )
    else:
        raise NotImplementedError
    env.close()
Example #10
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    #cmp_logdir = osp.join(args.log_dir, task_name)
    if MPI.COMM_WORLD.Get_rank() == 0:
        #writer = SummaryWriter(comment=task_name)
        writer = tf.summary.FileWriter(args.log_dir, U.get_session().graph)
    else:
        writer = None
    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name=task_name,
              writer=writer)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #11
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    import MujocoManip as MM
    if args.task == 'train':
        env_name, user_name = osp.basename(
            args.expert_path).split('.')[0].split('_')
    else:
        env_name, user_name = osp.basename(args.load_model_path).split('.')[:2]
    wrapper = '%sWrapper' % env_name
    render = args.task == 'evaluate'

    if env_name == 'SawyerLiftEnv':
        env = MM.make(wrapper,
                      ignore_done=False,
                      use_eef_ctrl=False,
                      gripper_visualization=True,
                      use_camera_obs=False,
                      has_renderer=render,
                      reward_shaping=True,
                      has_offscreen_renderer=render)
    elif env_name == 'SawyerBinsEnv':
        env = MM.make(
            wrapper,
            ignore_done=False,
            use_eef_ctrl=False,
            gripper_visualization=True,
            use_camera_obs=False,
            has_renderer=render,
            reward_shaping=True,
            single_object_mode=False if 'hard' in user_name.lower() else True,
            has_offscreen_renderer=render)
    elif env_name == 'SawyerPegsEnv':
        env = MM.make(
            wrapper,
            ignore_done=False,
            use_eef_ctrl=False,
            gripper_visualization=True,
            use_camera_obs=False,
            has_renderer=render,
            reward_shaping=True,
            single_object_mode=False if 'hard' in user_name.lower() else True,
            has_offscreen_renderer=render)
    else:
        raise NotImplementedError

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(env_name, user_name) + '_%s_%s' % (
        args.algo, 1 if not args.mix_reward else args.rew_lambda)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    os.makedirs(args.log_dir, exist_ok=True)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, args.rew_lambda,
              args.mix_reward, task_name, args.frame_stack)
    elif args.task == 'evaluate':
        visualizer(env,
                   policy_fn,
                   args.load_model_path,
                   timesteps_per_batch=env.env.horizon,
                   number_trajs=10,
                   stochastic_policy=args.stochastic_policy,
                   save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #12
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )  # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env)  # wrap in the gym environment

    #task = 'train'
    task = 'evaluate'

    # Expert Path
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/150_grasp_shift2/combined/combined_0.npz'  # path for the 150_grasp_shift2 demonstrations

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)

    # Note: taking away the bench monitor wrapping allows rendering

    #env.seed(args.seed) # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ",
               env.action_space)  #logging the action space

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )

    play_env = GymWrapper(play_env)

    #Weights are loaded from reach model grasp_strange

    #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Create the playback environment

        _, _, last_ob, last_jpos = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    if task == 'train':
        play_env.close()

        dataset = Mujoco_Dset(expert_path=expert_path,
                              traj_limitation=args.traj_limitation)

        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train_grasp(env, last_ob, last_jpos, args.seed, policy_fn,
                    reward_giver, dataset, args.algo, args.g_step, args.d_step,
                    args.policy_entcoeff, args.num_timesteps,
                    args.save_per_iter, args.checkpoint_dir, args.log_dir,
                    args.pretrained, args.BC_max_iter, task_name)

    elif task == 'evaluate':
        pi_grasp = policy_fn("pi_grasp", ob_space, ac_space, reuse=False)
        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        with tf.compat.v1.Session() as sess:
            sess.run(init_op)
            ckpt_path_2 = './reach_and_grasp_weights/grasp_shift1_after_reach/grasptrpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess, ckpt_2.model_checkpoint_path)

            tt = 0
            ob = last_ob

            while True:
                ac, vpred = pi_grasp.act(False, ob)
                ob, rew, new, _ = play_env.step(ac)

                play_env.render()  # check the running in for the first part
                #logger.log("rendering for reach policy")

                if new or tt >= args.traj_limitation:
                    break
                tt += 1

        play_env.close()

    env.close()
Example #13
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    logger.configure()

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from baselines.gail import mlp_policy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        if args.states_only:
            reward_giver = WeakTransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        else:
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name,
              args.states_only
              )
    elif args.task == 'evaluate':
        from baselines.gail import mlp_policy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    elif args.task == 'expert_train':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)
        original_trpo.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
            max_timesteps=args.num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), args.save_model_path)
    elif args.task == 'expert_gen':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)
        runner(env,
               policy_fn,
               args.save_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example #14
def setup_and_learn(env,
                    nb_epochs,
                    nb_epoch_cycles,
                    render_eval,
                    reward_scale,
                    render,
                    actor,
                    critic,
                    classifier,
                    normalize_returns,
                    normalize_observations,
                    critic_l2_reg,
                    classifier_l2_reg,
                    actor_lr,
                    critic_lr,
                    classifier_lr,
                    action_noise,
                    popart,
                    gamma,
                    clip_norm,
                    nb_train_steps,
                    nb_rollout_steps,
                    nb_eval_steps,
                    batch_size,
                    memory,
                    fifomemory,
                    tau=0.01,
                    eval_env=None,
                    callback=None,
                    entropy_coeff=1.,
                    reward_giver=None,
                    expert_dataset=None,
                    g_step=4,
                    d_step=1,
                    d_stepsize=3e-4,
                    max_timesteps=0,
                    max_iters=0,
                    timesteps_per_batch=1024,
                    adversary_hidden_size=100,
                    adversary_entcoeff=1e-3,
                    task='train',
                    expert_path=None):  # TODO: max_episodes
    """
    Set up the learning agent and execute training.
    """
    logger.info('Initialize policy')
    logger.info('noisynet implementation of DDPG')

    assert task == 'train'

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG_paramnoise(actor,
                            critic,
                            classifier,
                            memory,
                            fifomemory,
                            env.observation_space.shape,
                            env.action_space.shape,
                            gamma=gamma,
                            tau=tau,
                            normalize_returns=normalize_returns,
                            normalize_observations=normalize_observations,
                            batch_size=batch_size,
                            action_noise=action_noise,
                            critic_l2_reg=critic_l2_reg,
                            classifier_l2_reg=classifier_l2_reg,
                            actor_lr=actor_lr,
                            critic_lr=critic_lr,
                            classifier_lr=classifier_lr,
                            enable_popart=popart,
                            clip_norm=clip_norm,
                            reward_scale=reward_scale,
                            entropy_coeff=entropy_coeff)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    logger.info('Initialize Discriminator')
    reward_giver = TransitionClassifier(env,
                                        adversary_hidden_size,
                                        entcoeff=adversary_entcoeff)
    d_adam = MpiAdam(reward_giver.get_trainable_variables())

    logger.info('Load Expert Data')
    dataset = Mujoco_Dset(expert_path=expert_path,
                          traj_limitation=-1)  # TODO: customize

    logger.info('Start training')
    with U.single_threaded_session() as sess:
        # init agent
        agent.initialize(sess)
        # tf saver
        saver = tf.train.Saver()
        # finalize graph
        sess.graph.finalize()

        learn(
            env,
            agent,
            reward_giver,
            dataset,
            g_step,
            d_step,
            d_stepsize=d_stepsize,
            timesteps_per_batch=timesteps_per_batch,
            nb_train_steps=nb_train_steps,
            max_timesteps=max_timesteps,
            max_iters=max_iters,  # TODO: max_episodes
            callback=callback,
            d_adam=d_adam,
            sess=sess,
            saver=saver)