def evaluate_policy(env, input_pol, exp, params, n_tests=100, render=False):
    """Evaluate every set of policy parameters stored in the experience
    dataset by running `n_tests` rollouts of horizon `min_steps` on `env`."""
    H = params['min_steps']
    angle_dims = params['angle_dims']

    def gTrig(state):
        # map angle dimensions to their sine/cosine representation
        return utils.gTrig_np(state, angle_dims).flatten()

    def step_cb(*args, **kwargs):
        if render:
            env.render()

    results = []
    for i, p in enumerate(exp.policy_parameters):
        utils.print_with_stamp('Evaluating policy at iteration %d' % i)
        if p:
            # evaluate the policy parameters saved at iteration i
            input_pol.set_params(p)
            pol = input_pol
        else:
            # no parameters stored; fall back to a uniformly-random policy
            pol = control.RandPolicy(maxU=input_pol.maxU)

        results_i = []
        for it in range(n_tests):
            ret = apply_controller(env, pol, H,
                                   preprocess=gTrig, callback=step_cb)
            results_i.append(ret)
        results.append(results_i)

    return results
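# Example usage of evaluate_policy (a minimal sketch; `my_env`, `my_pol` and
# `my_exp` are hypothetical objects built elsewhere, e.g. by an experiment
# setup function, and the parameter values below are illustrative):
#
#     eval_params = {'min_steps': 100, 'angle_dims': [3]}
#     results = evaluate_policy(my_env, my_pol, my_exp, eval_params,
#                               n_tests=10, render=False)
#     # results[i][j] holds the return value of apply_controller for the
#     # j-th test rollout of the policy parameters saved at iteration i.
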
def run_pilco_experiment(exp_setup=mcpilco_cartpole_experiment, params=None,
                         loss_kwargs={}, polopt_kwargs={}, extra_inps=[],
                         step_cb=None, polopt_cb=None,
                         learning_iteration_cb=None, max_dataset_size=0,
                         render=False, debug_plot=0):
    """Run a PILCO-style learning loop: collect initial experience, train the
    dynamics model, then alternate policy optimization, policy execution and
    dynamics model re-training for `n_opt` iterations."""
    # setup experiment
    exp_objs = exp_setup(params)
    p0, env, pol, dyn, cost, exp, polopt, learner, params = exp_objs
    n_rnd = params.get('n_rnd', 1)
    n_opt = params.get('n_opt', 100)
    return_best = params.get('return_best', False)
    crn_dropout = params.get('crn_dropout', True)
    H = params.get('min_steps', 100)
    gamma = params.get('discount', 1.0)
    angle_dims = params.get('angle_dims', [])

    # init callbacks
    # callback executed after every call to env.step
    def step_cb_internal(state, action, cost, info):
        exp.add_sample(state, action, cost, info)
        if render:
            env.render()
        if callable(step_cb):
            step_cb(state, action, cost, info)

    def polopt_cb_internal(*args, **kwargs):
        if not crn_dropout:
            # resample dyn and pol weights after every optimizer step
            if hasattr(dyn, 'update'):
                dyn.update()
            if hasattr(pol, 'update'):
                pol.update()
        if callable(polopt_cb):
            polopt_cb(*args, **kwargs)

    # function to execute before applying policy
    def gTrig(state):
        return utils.gTrig_np(state, angle_dims).flatten()

    # collect experience with random controls
    randpol = control.RandPolicy(maxU=pol.maxU)
    for i in range(n_rnd):
        exp.new_episode()
        if n_rnd > 1 and i == n_rnd - 1:
            p = pol
            utils.print_with_stamp('Executing initial policy')
        else:
            p = randpol
            utils.print_with_stamp('Executing uniformly-random controls')
        apply_controller(env, p, H,
                         preprocess=gTrig, callback=step_cb_internal)

    # 1. train dynamics once
    train_dynamics(dyn, exp, angle_dims=angle_dims,
                   max_dataset_size=max_dataset_size)

    # build loss function
    loss, inps, updts = learner.get_loss(pol, dyn, cost, angle_dims,
                                         **loss_kwargs)

    if debug_plot > 0:
        # build rollout function for plotting
        loss_kwargs['mm_state'] = False
        rollout_fn = learner.build_rollout(pol, dyn, cost, angle_dims,
                                           **loss_kwargs)
        fig, axarr = None, None

    # set objective of policy optimizer
    inps += extra_inps
    polopt.set_objective(loss, pol.get_params(symbolic=True), inps, updts,
                         **polopt_kwargs)

    # initial call so that the user gets the state before
    # the first learning iteration
    if callable(learning_iteration_cb):
        learning_iteration_cb(exp, dyn, pol, polopt, params)

    if crn_dropout:
        utils.print_with_stamp('using common random numbers for dyn and pol',
                               'experiment_utils')
    else:
        utils.print_with_stamp('resampling weights for dyn and pol',
                               'experiment_utils')

    for i in range(n_opt):
        total_exp = sum([len(st) for st in exp.states])
        msg = '==== Iteration [%d], experience: [%d steps] ===='
        utils.print_with_stamp(msg % (i + 1, total_exp))
        if crn_dropout:
            # with common random numbers, resample dyn and pol weights only
            # once per learning iteration
            if hasattr(dyn, 'update'):
                dyn.update()
            if hasattr(pol, 'update'):
                pol.update()

        # get initial state distribution (assumed gaussian)
        x0 = np.array([st[0] for st in exp.states])
        m0 = x0.mean(0)
        S0 = (np.cov(x0, rowvar=False, ddof=1)
              + 1e-4*np.eye(x0.shape[1])) if len(x0) > 10 else p0.cov

        # 2. optimize policy
        polopt_args = [m0, S0, H, gamma]
        if isinstance(polopt, optimizers.SGDOptimizer):
            # check if we have a learning rate parameter
            lr = params.get('learning_rate', 1e-4)
            if callable(lr):
                lr = lr(i)
            polopt_args.append(lr)
        polopt.minimize(*polopt_args,
                        callback=polopt_cb_internal,
                        return_best=return_best)

        # 3. apply controller
        exp.new_episode(policy_params=pol.get_params(symbolic=False))
        apply_controller(env, pol, H,
                         preprocess=gTrig, callback=step_cb_internal)

        # 4. train dynamics once
        train_dynamics(dyn, exp, angle_dims=angle_dims,
                       max_dataset_size=max_dataset_size)

        if callable(learning_iteration_cb):
            # user callback
            learning_iteration_cb(exp, dyn, pol, polopt, params)

        if debug_plot > 0:
            fig, axarr = plot_rollout(rollout_fn, exp, m0, S0, H, gamma,
                                      fig=fig, axarr=axarr)

    env.close()
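# Example usage of run_pilco_experiment (a minimal sketch, assuming the default
# mcpilco_cartpole_experiment setup; the parameter values are illustrative and
# `my_step_cb` is a hypothetical user callback):
#
#     def my_step_cb(state, action, cost, info):
#         print(cost)
#
#     my_params = {'n_rnd': 1, 'n_opt': 50, 'min_steps': 100,
#                  'discount': 1.0, 'angle_dims': [3],
#                  'learning_rate': 1e-4, 'return_best': False}
#     run_pilco_experiment(exp_setup=mcpilco_cartpole_experiment,
#                          params=my_params, step_cb=my_step_cb, render=False)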