Example #1
def evaluate_policy(env, input_pol, exp, params, n_tests=100, render=False):
    # rollout horizon and the state dimensions to be encoded as sin/cos pairs
    H = params['min_steps']
    angle_dims = params['angle_dims']

    def gTrig(state):
        # state preprocessing: replace angle dimensions with their sin/cos
        return utils.gTrig_np(state, angle_dims).flatten()

    def step_cb(*args, **kwargs):
        if render:
            env.render()

    results = []
    for i, p in enumerate(exp.policy_parameters):
        utils.print_with_stamp('Evaluating policy at iteration %d' % i)
        if p:
            # saved parameters exist for this iteration: evaluate the
            # learned policy with them
            input_pol.set_params(p)
            pol = input_pol
        else:
            # no saved parameters (e.g. the initial random episodes):
            # evaluate a uniformly-random policy instead
            pol = control.RandPolicy(maxU=input_pol.maxU)

        results_i = []
        for it in range(n_tests):
            ret = apply_controller(env,
                                   pol,
                                   H,
                                   preprocess=gTrig,
                                   callback=step_cb)
            results_i.append(ret)
        results.append(results_i)

    return results
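
Both examples preprocess raw states with `utils.gTrig_np` before the policy sees them. In PILCO-style code this replaces each angle dimension with its sine/cosine pair, so the policy never sees a wrap-around discontinuity at ±pi. The snippet below is a standalone NumPy sketch of that encoding, written here purely as an illustration; the name `gtrig_np_sketch` and the exact output ordering are assumptions, not the library's implementation.

import numpy as np

def gtrig_np_sketch(state, angle_dims):
    # hypothetical stand-in for utils.gTrig_np: keep non-angle dimensions
    # and append (sin, cos) for every angle dimension
    state = np.atleast_2d(state)
    other_dims = [d for d in range(state.shape[-1]) if d not in angle_dims]
    angles = state[:, angle_dims]
    return np.concatenate(
        [state[:, other_dims], np.sin(angles), np.cos(angles)], axis=-1)

# e.g. a cart-pole state [x, x_dot, theta, theta_dot] with angle_dims=[2]
# maps to [x, x_dot, theta_dot, sin(theta), cos(theta)]
print(gtrig_np_sketch(np.array([0.1, 0.0, np.pi, 0.0]), angle_dims=[2]))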
Example #2
def run_pilco_experiment(exp_setup=mcpilco_cartpole_experiment,
                         params=None,
                         loss_kwargs=None,
                         polopt_kwargs=None,
                         extra_inps=None,
                         step_cb=None,
                         polopt_cb=None,
                         learning_iteration_cb=None,
                         max_dataset_size=0,
                         render=False,
                         debug_plot=0):
    # avoid shared mutable default arguments (loss_kwargs is mutated below)
    loss_kwargs = {} if loss_kwargs is None else loss_kwargs
    polopt_kwargs = {} if polopt_kwargs is None else polopt_kwargs
    extra_inps = [] if extra_inps is None else extra_inps

    # setup experiment
    exp_objs = exp_setup(params)
    p0, env, pol, dyn, cost, exp, polopt, learner, params = exp_objs
    n_rnd = params.get('n_rnd', 1)
    n_opt = params.get('n_opt', 100)
    return_best = params.get('return_best', False)
    crn_dropout = params.get('crn_dropout', True)
    H = params.get('min_steps', 100)
    gamma = params.get('discount', 1.0)
    angle_dims = params.get('angle_dims', [])

    # init callbacks
    # callback executed after every call to env.step
    def step_cb_internal(state, action, cost, info):
        exp.add_sample(state, action, cost, info)
        if render:
            env.render()
        if callable(step_cb):
            step_cb(state, action, cost, info)

    # callback executed after each step of the policy optimizer
    def polopt_cb_internal(*args, **kwargs):
        if not crn_dropout:
            if hasattr(dyn, 'update'):
                dyn.update()
            if hasattr(pol, 'update'):
                pol.update()
        if callable(polopt_cb):
            polopt_cb(*args, **kwargs)

    # state preprocessing applied before the policy is evaluated:
    # angle dimensions are encoded as sin/cos pairs
    def gTrig(state):
        return utils.gTrig_np(state, angle_dims).flatten()

    # collect experience with random controls
    randpol = control.RandPolicy(maxU=pol.maxU)
    for i in range(n_rnd):
        exp.new_episode()
        if n_rnd > 1 and i == n_rnd - 1:
            p = pol
            utils.print_with_stamp('Executing initial policy')
        else:
            p = randpol
            utils.print_with_stamp('Executing uniformly-random controls')
        apply_controller(env,
                         p,
                         H,
                         preprocess=gTrig,
                         callback=step_cb_internal)

    # 1. train dynamics once
    train_dynamics(dyn,
                   exp,
                   angle_dims=angle_dims,
                   max_dataset_size=max_dataset_size)

    # build loss function
    loss, inps, updts = learner.get_loss(pol, dyn, cost, angle_dims,
                                         **loss_kwargs)

    if debug_plot > 0:
        # build rollout function for plotting
        loss_kwargs['mm_state'] = False
        rollout_fn = learner.build_rollout(pol, dyn, cost, angle_dims,
                                           **loss_kwargs)
        fig, axarr = None, None

    # set objective of policy optimizer
    inps += extra_inps
    polopt.set_objective(loss, pol.get_params(symbolic=True), inps, updts,
                         **polopt_kwargs)

    # initial call so that the user gets the state before
    # the first learning iteration
    if callable(learning_iteration_cb):
        learning_iteration_cb(exp, dyn, pol, polopt, params)

    if crn_dropout:
        utils.print_with_stamp('using common random numbers for dyn and pol',
                               'experiment_utils')
    else:
        utils.print_with_stamp('resampling weights for dyn and pol',
                               'experiment_utils')
    for i in range(n_opt):
        total_exp = sum([len(st) for st in exp.states])
        msg = '==== Iteration [%d], experience: [%d steps] ===='
        utils.print_with_stamp(msg % (i + 1, total_exp))
        if crn_dropout:
            if hasattr(dyn, 'update'):
                dyn.update()
            if hasattr(pol, 'update'):
                pol.update()

        # get initial state distribution (assumed gaussian)
        x0 = np.array([st[0] for st in exp.states])
        m0 = x0.mean(0)
        # regularized empirical covariance when enough start states are
        # available; otherwise fall back to the prior covariance p0.cov
        S0 = (np.cov(x0, rowvar=False, ddof=1) + 1e-4 * np.eye(x0.shape[1])
              if len(x0) > 10 else p0.cov)

        # 2. optimize policy
        polopt_args = [m0, S0, H, gamma]
        if isinstance(polopt, optimizers.SGDOptimizer):
            # check if we have a learning rate parameter
            lr = params.get('learning_rate', 1e-4)
            if callable(lr):
                lr = lr(i)
            polopt_args.append(lr)
        polopt.minimize(*polopt_args,
                        callback=polopt_cb_internal,
                        return_best=return_best)

        # 3. apply controller
        exp.new_episode(policy_params=pol.get_params(symbolic=False))
        apply_controller(env,
                         pol,
                         H,
                         preprocess=gTrig,
                         callback=step_cb_internal)
        # 4. train dynamics once
        train_dynamics(dyn,
                       exp,
                       angle_dims=angle_dims,
                       max_dataset_size=max_dataset_size)

        if callable(learning_iteration_cb):
            # user callback
            learning_iteration_cb(exp, dyn, pol, polopt, params)

        if debug_plot > 0:
            fig, axarr = plot_rollout(rollout_fn,
                                      exp,
                                      m0,
                                      S0,
                                      H,
                                      gamma,
                                      fig=fig,
                                      axarr=axarr)
    env.close()
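
A hypothetical driver for the routine above, using only the parameter names that appear in its body (`n_rnd`, `n_opt`, `min_steps`, `discount`, `angle_dims`, `learning_rate`, `return_best`). The concrete values, the `log_step` callback, and the assumption that the `exp_setup` function accepts a plain dict of overrides are illustrative, not taken from the library.

# hypothetical configuration; all values are illustrative assumptions
params = {
    'n_rnd': 2,            # episodes driven by random/initial controls
    'n_opt': 50,           # policy-learning iterations
    'min_steps': 100,      # rollout horizon H
    'discount': 1.0,       # cost discount factor gamma
    'angle_dims': [2],     # state dimensions encoded as sin/cos pairs
    'learning_rate': lambda i: 1e-3 * 0.99 ** i,  # schedule over iterations
    'return_best': True,
}

def log_step(state, action, cost, info):
    # user callback invoked after every env.step, in addition to the
    # internal bookkeeping in step_cb_internal
    pass

run_pilco_experiment(exp_setup=mcpilco_cartpole_experiment,
                     params=params,
                     step_cb=log_step,
                     render=False)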