Example #1
 def net_for_problem(self,
                     problem_ppddl_paths,
                     prob_name=None,
                     dropout=0.0):
     """Instantiate a network and environment for a specific problem. Also
     creates a handler for the problem (maybe this is a bad idea? IDK)."""
     all_ppddl_paths = self.extra_ppddl + list(problem_ppddl_paths)
     if prob_name is None:
         # get it automatically (and slowly…)
         prob_name = get_problem_names(all_ppddl_paths)[0]
     service_config = ProblemServiceConfig(
         all_ppddl_paths,
         prob_name,
         heuristic_name=self.heuristic_data_gen_name,
         # doesn't matter, use a cheap one
         teacher_heur='h-add',
         use_lm_cuts=self.use_lm_cuts)
     problem_server = ProblemServer(service_config)
     single_problem = SingleProblem(prob_name, problem_server)
     planner_exts = PlannerExtensions(service_config.pddl_files,
                                      service_config.init_problem_name)
     raw_env, flat_env = create_environment(
         single_problem.prob_meta,
         planner_exts,
         heuristic_name=self.heuristic_data_gen_name)
     problem_network = PropNetwork(self.weight_manager,
                                   single_problem.prob_meta,
                                   dropout=dropout,
                                   norm_response=self.norm_response)
     # env_spec is SingleProblem.env_spec
     policy = CategoricalMLPPolicy(env_spec=single_problem.env_spec,
                                   prob_network=problem_network,
                                   name='policy')
     # returns policy, RLLab environment, problem server handles
     return InstantiatedNetwork(self, policy, flat_env, single_problem)
Example #2
def train_gail(session,
               env,
               dataset,
               obs_dim=1,
               act_dim=2,
               n_itr=20,
               use_env_rewards=False,
               discount=.99,
               batch_size=4000,
               critic_scale=1.,
               gail_step_size=.01,
               critic_learning_rate=.001,
               policy_hid_layer_dims=[32, 32],
               gradient_penalty=.1,
               critic_n_train_epochs=1,
               sampler_args=dict(),
               return_algo=False):

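    # NOTE: the critic network's hidden layer sizes are hardcoded here rather than
    # exposed as a parameter (only the policy uses policy_hid_layer_dims).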
    network = CriticNetwork(hidden_layer_dims=[32, 32])
    critic = WassersteinCritic(obs_dim=obs_dim,
                               act_dim=act_dim,
                               dataset=dataset,
                               network=network,
                               verbose=2,
                               gradient_penalty=gradient_penalty,
                               optimizer=tf.train.AdamOptimizer(
                                   critic_learning_rate, beta1=.5, beta2=.9),
                               n_train_epochs=critic_n_train_epochs)
    policy = CategoricalMLPPolicy(name="policy",
                                  env_spec=env.spec,
                                  hidden_sizes=policy_hid_layer_dims)
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    reward_handler = RewardHandler(use_env_rewards=use_env_rewards,
                                   critic_final_scale=critic_scale)

    algo = GAIL(critic=critic,
                reward_handler=reward_handler,
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                max_path_length=200,
                n_itr=n_itr,
                discount=discount,
                step_size=gail_step_size,
                sampler_args=sampler_args)
    session.run(tf.global_variables_initializer())

    if return_algo:
        return algo

    algo.train(sess=session)

    return policy, critic
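A minimal usage sketch for the function above. This is an assumption-laden example, not part of the original: "CartPole-v0" is an arbitrary discrete-action environment, the import paths are the usual rllab locations, and `expert_dataset` stands in for whatever dataset object WassersteinCritic expects.

import tensorflow as tf
from rllab.envs.gym_env import GymEnv
from sandbox.rocky.tf.envs.base import TfEnv

with tf.Session() as session:
    env = TfEnv(GymEnv("CartPole-v0", record_video=False, record_log=False))
    # expert_dataset (hypothetical) holds the expert (observation, action) batches.
    policy, critic = train_gail(session,
                                env,
                                expert_dataset,
                                obs_dim=env.spec.observation_space.flat_dim,
                                act_dim=env.spec.action_space.flat_dim,
                                n_itr=5)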
Example #3
def main(
    log_dir,
    env_name,
    ent_coef,
    n_steps,
    total_timesteps,
    num_vec,
):
    tf.reset_default_graph()
    # n_steps is the `batch_size // num_vec` in `imitation`.
    batch_size = n_steps * num_vec
    n_itr = int(math.ceil(total_timesteps / batch_size))

    if env_name.startswith("airl/"):
        env_cls = CustomGymEnv
    else:
        env_cls = GymEnv
    env = TfEnv(env_cls(env_name, record_video=False, record_log=False))

    # NOTE: Haven't yet checked if hidden_sizes=(32, 32) matches the settings in
    # the `imitation` repo. We use the default Stable Baselines MLP policy.
    if isinstance(env.spec.action_space, Box):
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))
    else:
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=n_itr,
            batch_size=batch_size,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_coef,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            # Maybe it will be the case that not every policy is compatible with
            # the VectorizedSampler. In that case, consider changing to
            # `sampler_cls=None` and adding a dummy `n_envs` kwarg to BatchSampler.
            sampler_cls=VectorizedSampler,
            sampler_args=dict(n_envs=num_vec),
        )
        with rllab_logdir(algo=algo, dirname=log_dir):
            algo.train(sess)
Example #4
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )
Example #5
 def test_saver(self):
     savepath = 'data/model'
     env = TfEnv(TwoRoundNondeterministicRewardEnv())
     with tf.Session() as session:
         policy = CategoricalMLPPolicy(name="policy",
                                       env_spec=env.spec,
                                       hidden_sizes=[2, 2])
         session.run(tf.global_variables_initializer())
         saver = tf.train.Saver(max_to_keep=100,
                                keep_checkpoint_every_n_hours=.5)
         saver.save(session, savepath, global_step=0)
         params = policy.get_params()
         initial_values = policy.get_param_values()
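         # Zero out every policy parameter, then restore the checkpoint and
         # check that the original values come back.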
         assign = tf.group(
             *[tf.assign(p, tf.zeros_like(p)) for p in params])
         session.run(assign)
         self.assertEqual(np.sum(policy.get_param_values()), 0)
         latest = tf.train.latest_checkpoint('data')
         saver.restore(session, latest)
         final_values = policy.get_param_values()
         np.testing.assert_array_equal(initial_values, final_values)
Example #6
def rllab_envpolicy_parser(env, args):
    if isinstance(args, dict):
        args = tonamedtuple(args)

    env = RLLabEnv(env, mode=args.control)
    if args.algo[:2] == 'tf':
        env = TfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)

                assert len(strides) == len(chans) == len(
                    filts), "strides, chans and filts not equal"
                # only discrete actions supported, should be straightforward to extend to continuous
                assert isinstance(
                    env.spec.action_space,
                    Discrete), "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianGRUPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden[0]),
                                               name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy',
                        state_include_action=False if args.conv else True)
                else:
                    raise NotImplementedError(env.spec.action_space)

            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                else:
                    raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)

            assert len(strides) == len(chans) == len(
                filts), "strides, chans and filts not equal"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            if isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(env_spec=env.spec,
                                           hidden_sizes=tuple(
                                               args.policy_hidden),
                                           min_std=args.min_std,
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(env_spec=env.spec,
                                              hidden_sizes=tuple(
                                                  args.policy_hidden),
                                              name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)
    elif args.algo[:2] == 'th':
        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = thMLP(
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.observation_space, thBox):
                    policy = thGaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                elif isinstance(env.spec.observation_space, thDiscrete):
                    policy = thCategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                else:
                    raise NotImplementedError(env.spec.observation_space)

            # elif args.recurrent == 'lstm':
            #     if isinstance(env.spec.action_space, thBox):
            #         policy = thGaussianLSTMPolicy(env_spec=env.spec,
            #                                       feature_network=feature_network,
            #                                       hidden_dim=int(args.policy_hidden),
            #                                       name='policy')
            #     elif isinstance(env.spec.action_space, thDiscrete):
            #         policy = thCategoricalLSTMPolicy(env_spec=env.spec,
            #                                          feature_network=feature_network,
            #                                          hidden_dim=int(args.policy_hidden),
            #                                          name='policy')
            #     else:
            #         raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        else:
            if args.algo == 'thddpg':
                assert isinstance(env.spec.action_space, thBox)
                policy = thDeterministicMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                )
            else:
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianMLPPolicy(env_spec=env.spec,
                                                 hidden_sizes=tuple(
                                                     args.policy_hidden),
                                                 min_std=args.min_std)
                elif isinstance(env.spec.action_space, thDiscrete):
                    policy = thCategoricalMLPPolicy(env_spec=env.spec,
                                                    hidden_sizes=tuple(
                                                        args.policy_hidden),
                                                    min_std=args.min_std)
                else:
                    raise NotImplementedError(env.spec.action_space)

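    # NOTE: `policies` is never assigned in this function, so the 'concurrent'
    # branch below would raise a NameError as written.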
    if args.control == 'concurrent':
        return env, policies
    else:
        return env, policy
Example #7
def make_policy(args,
                env_spec,
                dom_meta,
                prob_meta,
                dg_extra_dim=None,
                weight_manager=None):
    # size of input and output
    obs_dim = int(env_spec.observation_space.flat_dim)
    act_dim = int(env_spec.action_space.flat_dim)

    # can make normal FC MLP or an action/proposition network
    known_mo = set()  # type: Set[str]
    if args.model == 'simple':
        known_mo = {'hidden_size', 'num_layers'}
        mod = args.model_opts
        hidden_size = int(mod.get('hidden_size', 32))
        num_layers = int(mod.get('num_layers', 2))
        print('Layer size: %d' % hidden_size)
        print('Number of layers: %d' % num_layers)
        # The dense policy network should have two hidden layers, each with
        # <obs dim> units (thereabouts, anyway)
        custom_network = make_masked_mlp('simple_masked_mlp', obs_dim, act_dim,
                                         (hidden_size, ) * num_layers)
    elif args.model == 'actprop':
        known_mo = {'hidden_size', 'num_layers', 'dropout', 'norm_response'}
        mod = args.model_opts
        hs = int(mod.get('hidden_size', 8))
        num_layers = int(mod.get('num_layers', 2))
        dropout = float(mod.get('dropout', 0.0))
        norm_response = int(mod.get('norm_response', '0')) != 0
        print(
            'hidden_size: %d, num_layers: %d, dropout: %f, norm_response: %d' %
            (hs, num_layers, dropout, int(norm_response)))
        if weight_manager is not None:
            print('Re-using same weight manager')
        elif args.resume_from:
            print('Reloading weight manager (resuming training)')
            weight_manager = joblib.load(args.resume_from)
        else:
            print('Creating new weight manager (not resuming)')
            # TODO: should save all network metadata, including heuristic
            # configuration
            # extra_dim = sum([g.extra_dim for g in data_gens])
            weight_manager = PropNetworkWeights(
                dom_meta,
                hidden_sizes=[(hs, hs)] * num_layers,
                # extra inputs to each action module from data generators
                extra_dim=dg_extra_dim)
        custom_network = PropNetwork(weight_manager,
                                     prob_meta,
                                     dropout=dropout,
                                     norm_response=norm_response)
    else:
        raise ValueError('Unknown network type "%s"' % args.model)

    # What if a model option wasn't used?
    unknown_mo = args.model_opts.keys() - known_mo
    if unknown_mo:
        print('WARNING: model options not understood by "%s" network: %s' %
              (args.model, ', '.join(unknown_mo)),
              file=sys.stderr)

    policy = CategoricalMLPPolicy(env_spec=env_spec,
                                  prob_network=custom_network,
                                  name='policy')

    # weight_manager will sometimes be None
    return policy, weight_manager
Example #8
def fu_irl(
    venv,
    is_airl,
    expert=None,
    expert_venv=None,
    expert_trajectories=None,
    total_timesteps=10000,
    gen_batch_size=200,
    policy_lr=1e-3,
    callback=None,
    **kwargs,
):
    # Disable algorithm's internal prints
    old_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')

    raw_env = get_raw_env(venv)
    tf_env = TfEnv(GymEnv(env=raw_env, record_video=False, record_log=False))

    if expert_trajectories is None:
        expert_trajectories = sample_trajectories(
            expert_venv, expert, n_episodes=total_timesteps
        )
    expert_trajectories = to_rllab_trajectories(expert_trajectories, venv)

    if is_airl:
        irl_model = AIRLStateAction(
            env_spec=tf_env.spec, expert_trajs=expert_trajectories
        )
        entropy_weight = 1.0
    else:
        irl_model = GAIL(env_spec=tf_env.spec, expert_trajs=expert_trajectories)
        entropy_weight = 0.0

    if isinstance(venv.action_space, Discrete):
        policy = CategoricalMLPPolicy(
            name="policy", env_spec=tf_env.spec, hidden_sizes=(32, 32)
        )
    else:
        policy = GaussianMLPPolicy(
            name="policy", env_spec=tf_env.spec, hidden_sizes=(32, 32)
        )

    num_epochs = int(total_timesteps // gen_batch_size)

    algo = IRLTRPO(
        env=tf_env,
        policy=policy,
        irl_model=irl_model,
        n_itr=num_epochs,
        batch_size=gen_batch_size,
        max_path_length=100,
        discount=0.99,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=tf_env.spec),
    )
    algo.train()

    sys.stdout = old_stdout

    def predict_fn(ob, state=None, deterministic=False):
        act, _ = algo.policy.get_action(ob)
        return act, state

    results = {}
    results["policy"] = LightweightRLModel(predict_fn=predict_fn, env=venv)

    return results
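Example #9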
# if args.env not in supported_envs:
#     raise Exception("Env not supported! Try it out though?")

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288

register_custom_envs()

gymenv = GymEnv(args.env, force_reset=True)
# gymenv.env.seed(124)
env = TfEnv(normalize(gymenv, normalize_obs=False))

if type(env.spec.action_space) is Discrete:
    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
    )
else:
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
    )

baseline = LinearFeatureBaseline(env_spec=env.spec)

iters = args.num_iters

algo = TRPO(
    env=env,
Example #10
def get_policy(env):
    policy_network = get_policy_network(env)
    policy = CategoricalMLPPolicy(name='policy',
                                  env_spec=env.spec,
                                  prob_network=policy_network)
    return policy
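A plausible sketch of the `get_policy_network` helper used above (an assumption, not the original code): a softmax MLP over the flattened observation, suitable for passing as prob_network to CategoricalMLPPolicy; the (32, 32) hidden sizes are arbitrary.

import tensorflow as tf
from sandbox.rocky.tf.core.network import MLP

def get_policy_network(env):
    # Maps flattened observations to a probability distribution over discrete actions.
    return MLP(name='prob_network',
               input_shape=(env.spec.observation_space.flat_dim, ),
               output_dim=env.spec.action_space.n,
               hidden_sizes=(32, 32),
               hidden_nonlinearity=tf.nn.tanh,
               output_nonlinearity=tf.nn.softmax)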
Example #11
def main(_):

    env = TfEnv(
        AtariEnv(args.env,
                 force_reset=True,
                 record_video=False,
                 record_log=False,
                 resize_size=args.resize_size,
                 atari_noop=args.atari_noop,
                 atari_eplife=args.atari_eplife,
                 atari_firereset=args.atari_firereset))

    policy_network = ConvNetwork(
        name='prob_network',
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.n,
        # number of channels/filters for each conv layer
        conv_filters=(16, 32),
        # filter size
        conv_filter_sizes=(8, 4),
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        batch_normalization=False)
    policy = CategoricalMLPPolicy(name='policy',
                                  env_spec=env.spec,
                                  prob_network=policy_network)

    if (args.value_function == 'zero'):
        baseline = ZeroBaseline(env.spec)
    else:
        value_network = get_value_network(env)
        baseline_batch_size = args.batch_size * 10

        if (args.value_function == 'conj'):
            baseline_optimizer = ConjugateGradientOptimizer(
                subsample_factor=1.0, num_slices=args.num_slices)
        elif (args.value_function == 'adam'):
            baseline_optimizer = FirstOrderOptimizer(
                max_epochs=3,
                batch_size=512,
                num_slices=args.num_slices,
                verbose=True)
        else:
            logger.log("Inappropriate value function")
            exit(0)
        '''
      baseline = GaussianMLPBaseline(
          env.spec,
          num_slices=args.num_slices,
          regressor_args=dict(
              step_size=0.01,
              mean_network=value_network,
              optimizer=baseline_optimizer,
              subsample_factor=1.0,
              batchsize=baseline_batch_size,
              use_trust_region=False
          )
      )
      '''
        baseline = DeterministicMLPBaseline(env.spec,
                                            num_slices=args.num_slices,
                                            regressor_args=dict(
                                                network=value_network,
                                                optimizer=baseline_optimizer,
                                                normalize_inputs=False))

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=4500,
                n_itr=args.n_itr,
                discount=args.discount_factor,
                step_size=args.step_size,
                clip_reward=(not args.reward_no_scale),
                optimizer_args={
                    "subsample_factor": 1.0,
                    "num_slices": args.num_slices
                }
                #       plot=True
                )

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.n_cpu,
                            inter_op_parallelism_threads=args.n_cpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()
    algo.train(sess)
Example #12
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baselin_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--checkpoint', type=str, default=None)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(StandardizedEnv(env,
                                     scale_reward=args.reward_scale,
                                     enable_obsnorm=False),
                     mode=args.control))

        if args.recurrent:
            if args.conv:
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=args.hidden_sizes[0],
                                              name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=args.hidden_sizes[0],
                                               name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control,
    )

    algo.train()
Example #13
    def parse_env_args(self, env, args):

        if isinstance(args, dict):
            args = to_named_tuple(args)

        # Multi-agent wrapper
        env = RLLabEnv(env, ma_mode=args.control)
        env = MATfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)

                assert len(strides) == len(chans) == len(
                    filts), "strides, chans and filts not equal"
                # only discrete actions supported, should be straightforward to extend to continuous
                assert isinstance(
                    env.spec.action_space,
                    Discrete), "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    if args.control == 'concurrent':
                        policies = [
                            GaussianGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=int(
                                                  args.policy_hidden[0]),
                                              name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    policy = GaussianGRUPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden[0]),
                                               name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    if args.control == 'concurrent':
                        policies = [
                            CategoricalGRUPolicy(
                                env_spec=env.spec,
                                feature_network=feature_network,
                                hidden_dim=int(args.policy_hidden[0]),
                                name='policy_{}'.format(agid),
                                state_include_action=False
                                if args.conv else True)
                            for agid in range(len(env.agents))
                        ]
                    q_network = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='q_network',
                        state_include_action=False if args.conv else True)
                    target_q_network = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='target_q_network',
                        state_include_action=False if args.conv else True)
                    policy = {
                        'q_network': q_network,
                        'target_q_network': target_q_network
                    }
                else:
                    raise NotImplementedError(env.spec.action_space)

            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    if args.control == 'concurrent':
                        policies = [
                            GaussianLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden),
                                               name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    if args.control == 'concurrent':
                        policies = [
                            CategoricalLSTMPolicy(
                                env_spec=env.spec,
                                feature_network=feature_network,
                                hidden_dim=int(args.policy_hidden),
                                name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    q_network = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='q_network')
                    target_q_network = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='target_q_network')
                    policy = {
                        'q_network': q_network,
                        'target_q_network': target_q_network
                    }
                else:
                    raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)

            assert len(strides) == len(chans) == len(
                filts), "strides, chans and filts not equal"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=(args.conv_pads, ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax,
                batch_normalization=args.batch_normalization)
            if args.algo == 'dqn':
                q_network = CategoricalMLPPolicy(name='q_network',
                                                 env_spec=env.spec,
                                                 prob_network=feature_network)
                target_q_network = CategoricalMLPPolicy(
                    name='target_q_network',
                    env_spec=env.spec,
                    prob_network=feature_network)
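                # NOTE: q_network and target_q_network wrap the same feature_network
                # instance, so the target network appears to share weights with the
                # online network rather than holding an independent copy.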
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }

            else:
                policy = CategoricalMLPPolicy(name='policy',
                                              env_spec=env.spec,
                                              prob_network=feature_network)
        else:
            if env.spec is None:

                networks = [
                    DQNNetwork(i,
                               env,
                               target_network_update_freq=self.args.
                               target_network_update,
                               discount_factor=self.args.discount,
                               batch_size=self.args.batch_size,
                               learning_rate=self.args.qfunc_lr)
                    for i in range(env.n)
                ]

                policy = networks

            elif isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(env_spec=env.spec,
                                           hidden_sizes=tuple(
                                               args.policy_hidden),
                                           min_std=args.min_std,
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(env_spec=env.spec,
                                              hidden_sizes=tuple(
                                                  args.policy_hidden),
                                              name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)

        return env, policy
Example #14
# env = TfEnv(normalize(gymenv))

# policy = GaussianMLPPolicy(
# name="policy",
# env_spec=env.spec,
# # The neural network policy should have two hidden layers, each with 32 hidden units.
# hidden_sizes=(100, 50, 25),
# hidden_nonlinearity=tf.nn.relu,
# )
"""
Use CategoricalMLPPolicy for GridWorld Environment
"""
policy = CategoricalMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=2000,
    #max_path_length=env.horizon,
    n_itr=1000,
    discount=0.99,
def run_experiment(expert_rollout_pickle_path,
                   trained_policy_pickle_path,
                   env,
                   cost_trainer_type,
                   iterations=30,
                   num_frames=1,
                   traj_len=200,
                   config={}):

    # Load the expert rollouts into memory
    expert_rollouts = load_expert_rollouts(expert_rollout_pickle_path)

    # In the case that we only have one expert rollout in the file
    if type(expert_rollouts) is dict:
        expert_rollouts = [expert_rollouts]

    #TODO: make this configurable
    expert_rollouts = [
        shorten_tensor_dict(x, traj_len) for x in expert_rollouts
    ]

    # import pdb; pdb.set_trace()

    # Sanity check, TODO: should prune any "expert" rollouts with suboptimal reward?
    print("Average reward for expert rollouts: %f" %
          np.mean([np.sum(p['rewards']) for p in expert_rollouts]))

    if "transformers" in config and len(config["transformers"]) > 0:
        print("Transforming expert rollouts...")
        for rollout in tqdm(expert_rollouts):
            transformed_observations = []
            for ob in tqdm(rollout["observations"]):
                for transformer in config["transformers"]:
                    ob = transformer.transform(ob)
                transformed_observations.append(ob)
            rollout["observations"] = np.array(transformed_observations)

    # Handle both flattened state input and image input
    # TODO: this could be done better by looking at just the shape and determining from that
    if config["img_input"]:
        obs_dims = expert_rollouts[0]['observations'][0].shape
    else:
        # import pdb; pdb.set_trace()
        obs_dims = len(expert_rollouts[0]['observations'][0])

    if "num_novice_rollouts" in config:
        number_of_sample_trajectories = config["num_novice_rollouts"]
    else:
        number_of_sample_trajectories = len(expert_rollouts)

    print(number_of_sample_trajectories)

    # Choose a policy (Conv based on images, mlp based on states)
    # TODO: may also have to switch out categorical for something else in continuous state spaces??
    # Let's just avoid that for now?
    # TODO: unclear right now if this even works OK; get poor results early on.
    if config["img_input"]:
        policy = CategoricalConvPolicy(
            name="policy",
            env_spec=env.spec,
            conv_filters=[32, 64, 64],
            conv_filter_sizes=[3, 3, 3],
            conv_strides=[1, 1, 1],
            conv_pads=['SAME', 'SAME', 'SAME'],
            # The neural network policy should have two hidden layers, each with 100 hidden units each (see RLGAN paper)
            hidden_sizes=[200, 200])
    elif type(env.spec.action_space) == Discrete:
        policy = CategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 100 hidden units each (see RLGAN paper)
            hidden_sizes=(400, 300))
    else:
        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(100, 50, 25))

    if config["img_input"]:
        # TODO: right now the linear feature baseline is too computationally expensive to actually use
        # with full image inputs, so for now just use the zero baseline
        baseline = ZeroBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        # batch_size and max_path_length are used internally by the sampler; we rely on
        # that sampler to generate our samples, hence we pass them here. A cleaner way
        # may be to create our own sampler, but for now this will do.
        batch_size=number_of_sample_trajectories * traj_len,
        max_path_length=traj_len,
        n_itr=40,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
            max_backtracks=40))

    # Prune the number of rollouts if that option is enabled
    if "num_expert_rollouts" in config:
        rollouts_to_use = min(config["num_expert_rollouts"],
                              len(expert_rollouts))
        expert_rollouts = expert_rollouts[:rollouts_to_use]
        print("Only using %d expert rollouts" % rollouts_to_use)

    true_rewards = []
    actual_rewards = []

    # Extract observations to a tensor
    expert_rollouts_tensor = tensor_utils.stack_tensor_list(
        [path["observations"] for path in expert_rollouts])

    if "oversample" in config and config["oversample"]:
        oversample_rate = max(
            int(number_of_sample_trajectories / len(expert_rollouts_tensor)),
            1.)
        expert_rollouts_tensor = expert_rollouts_tensor.repeat(oversample_rate,
                                                               axis=0)
        print("oversampling %d times to %d" %
              (oversample_rate, len(expert_rollouts_tensor)))

    with tf.Session() as sess:
        algo.start_worker()

        cost_trainer = cost_trainer_type([num_frames, obs_dims], config=config)

        trainer = Trainer(env=env,
                          sess=sess,
                          cost_approximator=cost_trainer,
                          cost_trainer=cost_trainer,
                          novice_policy=policy,
                          novice_policy_optimizer=algo,
                          num_frames=num_frames)
        sess.run(tf.global_variables_initializer())

        for iter_step in range(0, iterations):
            # Dump data points on the last iteration if option graphs were requested.
            dump_data = (iter_step == iterations - 1) and config["generate_option_graphs"]
            true_reward, actual_reward = trainer.step(
                dump_datapoints=dump_data,
                config=config,
                expert_horizon=traj_len,
                number_of_sample_trajectories=number_of_sample_trajectories)
            true_rewards.append(true_reward)
            actual_rewards.append(actual_reward)

            # run a rollout for the video
            if "recording_env" in config:
                novice_rollouts = rollout_policy(policy,
                                                 config["recording_env"],
                                                 get_image_observations=False,
                                                 max_path_length=200)

        novice_rollouts = algo.obtain_samples(iter_step)

        rollout_rewards = [np.sum(x['rewards']) for x in novice_rollouts]

        print("Reward stats for final policy: %f +/- %f " %
              (np.mean(rollout_rewards), np.std(rollout_rewards)))
        # save the novice policy learned
        with open(trained_policy_pickle_path, "wb") as output_file:
            pickle.dump(policy, output_file)
        # TODO: also save the reward function?

        algo.shutdown_worker()

        second_true_rewards = []
        second_actual_rewards = []
        # Do our transfer learning task here:
        # TODO: move this to a separate script and save the learned weights
        if config['second_env'] is not None:
            with tf.variable_scope("second_policy"):
                #TODO: remove gross copypasta
                if not config["reset_second_policy"]:
                    second_policy = Serializable.clone(
                        policy)  # TODO: start with a fresh policy
                else:
                    # TODO: unclear right now if this even works OK; get poor results early on.
                    if config["img_input"]:
                        second_policy = CategoricalConvPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            conv_filters=[32, 64, 64],
                            conv_filter_sizes=[3, 3, 3],
                            conv_strides=[1, 1, 1],
                            conv_pads=['SAME', 'SAME', 'SAME'],
                            # The neural network policy should have two hidden layers, each with 100 hidden units each (see RLGAN paper)
                            hidden_sizes=[200, 200])
                    elif type(env.spec.action_space) == Discrete:
                        second_policy = CategoricalMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            # The neural network policy should have two hidden layers, each with 100 hidden units each (see RLGAN paper)
                            hidden_sizes=(400, 300))
                    else:
                        second_policy = GaussianMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            hidden_sizes=(100, 50, 25))

                if config["img_input"]:
                    # TODO: right now the linear feature baseline is too computationally expensive to actually use
                    # with full image inputs, so for now just use the zero baseline
                    baseline = ZeroBaseline(env_spec=config["second_env"].spec)
                else:
                    baseline = LinearFeatureBaseline(
                        env_spec=config["second_env"].spec)

                algo = TRPO(
                    env=config["second_env"],
                    policy=second_policy,
                    baseline=baseline,
                    # batch_size and max_path_length are used internally by TRPO's
                    # sampler, which we reuse to generate our samples; a cleaner
                    # approach would be a dedicated sampler.
                    batch_size=number_of_sample_trajectories * traj_len,
                    max_path_length=traj_len,
                    n_itr=40,
                    discount=0.995,
                    step_size=0.01,
                    optimizer=ConjugateGradientOptimizer(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
                        max_backtracks=40))

            if not config["stop_disc_training_on_second_run"] and config[
                    "use_prev_options_relearn_mixing_func"]:
                # If we're not retraining the discriminator at all in the transfer learning step,
                # just keep the old network
                options = cost_trainer.disc.discriminator_options
                cost_trainer.disc._remake_network_from_disc_options(
                    options,
                    stop_gradients=(not config["retrain_options"]),
                    num_extra_options=config["num_extra_options_on_transfer"])

            trainer = Trainer(
                env=config['second_env'],
                sess=sess,
                cost_approximator=cost_trainer,
                cost_trainer=cost_trainer,
                novice_policy=second_policy,
                novice_policy_optimizer=algo,
                num_frames=num_frames,
                train_disc=(not config["stop_disc_training_on_second_run"]))
            algo.start_worker()

            initialize_uninitialized(sess)
            for iter_step in range(iterations):
                # only dump datapoints on the last iteration, and only if option graphs were requested
                dump_data = (iter_step == iterations - 1) and config["generate_option_graphs"]
                true_reward, actual_reward = trainer.step(
                    expert_rollouts_tensor=expert_rollouts_tensor,
                    dump_datapoints=dump_data,
                    config=config,
                    expert_horizon=traj_len,
                    number_of_sample_trajectories=number_of_sample_trajectories
                )
                second_true_rewards.append(true_reward)
                second_actual_rewards.append(actual_reward)

                # run a rollout for the video
                if "recording_env" in config:
                    novice_rollouts = rollout_policy(
                        second_policy,
                        config["recording_env"],
                        get_image_observations=False,
                        max_path_length=traj_len)

            # gather one final batch of on-policy samples to report reward statistics
            novice_rollouts = algo.obtain_samples(iter_step)

            rollout_rewards = [np.sum(x['rewards']) for x in novice_rollouts]
            print("Reward stats for final policy: %f +/- %f " %
                  (np.mean(rollout_rewards), np.std(rollout_rewards)))
            # save the novice policy learned
            with open(trained_policy_pickle_path, "wb") as output_file:
                pickle.dump(second_policy, output_file)

            algo.shutdown_worker()

    return true_rewards, actual_rewards, second_true_rewards, second_actual_rewards
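
# A minimal sketch (not part of the original script) of how the four reward curves
# returned above might be inspected offline. The plotting code and the output path are
# illustrative assumptions; it only assumes the returned values are plain lists of
# per-iteration floats.
def plot_reward_curves(true_rewards, actual_rewards,
                       second_true_rewards, second_actual_rewards,
                       out_path="reward_curves.png"):
    import matplotlib
    matplotlib.use("Agg")  # render to a file without needing a display
    import matplotlib.pyplot as plt

    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(true_rewards, label="true_reward")
    ax1.plot(actual_rewards, label="actual_reward")
    ax1.set_title("first environment")
    ax1.legend()
    ax2.plot(second_true_rewards, label="true_reward")
    ax2.plot(second_actual_rewards, label="actual_reward")
    ax2.set_title("transfer environment (second_env)")
    ax2.set_xlabel("training iteration")
    ax2.legend()
    fig.savefig(out_path)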
Exemplo n.º 16
0
    @property
    def observation_space(self):
        return Discrete(4)


if __name__ == "__main__":

    env = GridWorldEnv()

    from sandbox.rocky.tf.policies.categorical_mlp_policy import CategoricalMLPPolicy
    from sandbox.rocky.tf.core.network import MLP
    from sandbox.rocky.tf.envs.base import TfEnv
    from sandbox.rocky.tf.algos.trpo import TRPO
    import tensorflow as tf
    env = TfEnv(env)

    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline

    policy = CategoricalMLPPolicy(env_spec=env.spec, name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                whole_paths=True,
                max_path_length=50,
                n_itr=40,
                discount=0.40)
    algo.train()
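
    # A hedged sketch (not in the original example): step the raw GridWorldEnv with
    # random actions to get a baseline episode return to compare against the TRPO
    # policy trained above. Assumes the standard rllab Env API (reset(), step()
    # returning a 4-tuple, action_space.sample()).
    check_env = GridWorldEnv()
    check_env.reset()
    random_return = 0.0
    for _ in range(50):
        _, reward, done, _ = check_env.step(check_env.action_space.sample())
        random_return += reward
        if done:
            break
    print("random-action return over one episode: %f" % random_return)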
Exemplo n.º 17
0
            summary_writer=summary_writer,
            verbose=2)

        # build the policy
        latent_sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(k=scheduler_k),
            name='latent_sampler',
            dim=latent_dim)
        policy = CategoricalLatentVarMLPPolicy(policy_name="policy",
                                               latent_sampler=latent_sampler,
                                               env_spec=env.spec,
                                               hidden_sizes=(64, 64))
    else:
        # build the policy
        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        recognition_model = None

    # build gail
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    reward_handler = hgail.misc.utils.RewardHandler(
        use_env_rewards=False,
        max_epochs=50,  # epoch at which final scales are used
        critic_final_scale=1.,
        recognition_initial_scale=0.)

    session.run(tf.global_variables_initializer())

    saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=.5)
    if initial_filepath:
Exemplo n.º 18
0
def get_policy(env, algo_name, info, policy_hidden_sizes,
               policy_hidden_nonlinearity, policy_output_nonlinearity,
               recurrent, **kwargs):
    policy = None
    policy_class = None
    hidden_sizes = get_hidden_sizes(policy_hidden_sizes)
    hidden_nonlinearity = get_nonlinearity(policy_hidden_nonlinearity)
    output_nonlinearity = get_nonlinearity(policy_output_nonlinearity)
    if algo_name in [
            'trpo',
            'actrpo',
            'acqftrpo',
            'qprop',
            'mqprop',
            'qfqprop',
            'trpg',
            'trpgoff',
            'nuqprop',
            'nuqfqprop',
            'nafqprop',
            'vpg',
            'qvpg',
            'dspg',
            'dspgoff',
    ]:
        if not info['is_action_discrete']:
            if recurrent:
                policy = GaussianLSTMPolicy(
                    name="gauss_lstm_policy",
                    env_spec=env.spec,
                    lstm_layer_cls=L.TfBasicLSTMLayer,
                    # gru_layer_cls=L.GRULayer,
                    output_nonlinearity=output_nonlinearity,  # None
                )
                policy_class = 'GaussianLSTMPolicy'
            else:
                policy = GaussianMLPPolicy(
                    name="gauss_policy",
                    env_spec=env.spec,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,  # tf.nn.tanh
                    output_nonlinearity=output_nonlinearity,  # None
                )
                policy_class = 'GaussianMLPPolicy'
        else:
            if recurrent:
                policy = CategoricalLSTMPolicy(
                    name="cat_lstm_policy",
                    env_spec=env.spec,
                    lstm_layer_cls=L.TfBasicLSTMLayer,
                    # gru_layer_cls=L.GRULayer,
                )
                policy_class = 'CategoricalLSTMPolicy'
            else:
                policy = CategoricalMLPPolicy(
                    name="cat_policy",
                    env_spec=env.spec,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,  # tf.nn.tanh
                )
                policy_class = 'CategoricalMLPPolicy'
    elif algo_name in [
            'ddpg',
    ]:
        assert not info['is_action_discrete']
        policy = DeterministicMLPPolicy(
            name="det_policy",
            env_spec=env.spec,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,  # tf.nn.relu
            output_nonlinearity=output_nonlinearity,  # tf.nn.tanh
        )
        policy_class = 'DeterministicMLPPolicy'
    print('[get_policy] Instantiating %s with hidden_sizes=%s, hidden_nonlinearity=%s.'
          % (policy_class, str(hidden_sizes), policy_hidden_nonlinearity))
    print('[get_policy] output_nonlinearity=%s.' %
          (policy_output_nonlinearity))
    return policy
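
# A hedged usage sketch for get_policy (not from the original source). The environment
# choice and the string formats expected by get_hidden_sizes / get_nonlinearity
# ('100,50', 'tanh', 'none') are assumptions made for illustration only.
if __name__ == "__main__":
    from rllab.envs.normalized_env import normalize
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.base import TfEnv

    demo_env = TfEnv(normalize(CartpoleEnv()))
    demo_policy = get_policy(
        env=demo_env,
        algo_name='trpo',
        info={'is_action_discrete': False},
        policy_hidden_sizes='100,50',          # assumed comma-separated format
        policy_hidden_nonlinearity='tanh',     # assumed nonlinearity name
        policy_output_nonlinearity='none',     # assumed name for "no output nonlinearity"
        recurrent=False)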