def adabatch(env, policy, horizon, batchsize=100, iterations=1000, gamma=0.99,
             rmax=1., phimax=1., safety_requirement=MonotonicImprovement(),
             test_det=True, render=False, seed=None, baseline='peters',
             action_filter=None, parallel=False, n_jobs=4,
             logger=Logger(name='test_sunday'), save_params=1000,
             log_params=True, verbose=True):
    """
    Only for SIMPLE Gaussian policy w/ scalar variance
    """
    # Defaults
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)

    # Seeding agent
    if seed is not None:
        seed_all_agent(seed)

    # Preparing logger
    algo_info = {'Algorithm': 'ADABATCH',
                 'Environment': str(env),
                 'BatchSize': batchsize,
                 'Max horizon': horizon,
                 'Iterations': iterations,
                 'gamma': gamma,
                 'actionFilter': action_filter,
                 'rmax': rmax,
                 'phimax': phimax}
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'Alpha', 'BatchSize',
                'Exploration', 'ThetaGradNorm', 'Penalty', 'Coordinate']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_det:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    # Learning
    avol = torch.tensor(env.action_space.high - env.action_space.low).item()
    it = 0
    while it < iterations:
        # Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())

        # Test the deterministic (mean) policy
        if test_det:
            omega = policy.get_scale_params()
            policy.set_scale_params(-100.)
            batch = generate_batch(env, policy, horizon, 1, action_filter)
            policy.set_scale_params(omega)
            log_row['DetPerf'] = performance(batch, gamma)
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        # Collect trajectories and estimate the policy gradient
        omega = policy.get_scale_params()
        sigma = torch.exp(omega).item()
        batch = generate_batch(env, policy, horizon, batchsize, action_filter,
                               parallel=parallel, n_jobs=n_jobs, seed=seed)
        grad = simple_gpomdp_estimator(batch, gamma, policy, baseline)
        theta_grad = grad[1:]
        norminf = torch.max(torch.abs(theta_grad))
        k = torch.argmax(torch.abs(theta_grad))

        # Safe step size for the single-coordinate update
        penalty = rmax * phimax**2 / (1 - gamma)**2 * \
            (avol / (sigma * math.sqrt(2 * math.pi)) + gamma / (2 * (1 - gamma)))
        alpha_star = sigma**2 / (2 * penalty)
        Cmax = (alpha_star * norminf**2 / 2).item()
        C = safety_requirement.next()
        alpha = alpha_star * (1 + math.sqrt(1 - C / (Cmax + 1e-12) + 1e-12))
        theta = policy.get_loc_params()
        new_theta = theta.clone()  # avoid modifying the policy parameters in place
        new_theta[k] += alpha * theta_grad[k]
        policy.set_loc_params(new_theta)

        # Log
        log_row['Coordinate'] = k.item()
        log_row['Alpha'] = alpha
        log_row['Penalty'] = penalty
        log_row['ThetaGradNorm'] = torch.norm(theta_grad).item()
        log_row['BatchSize'] = batchsize
        log_row['Exploration'] = policy.exploration()
        log_row['Perf'] = performance(batch, gamma)
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        # Next iteration
        it += 1

    # Final policy
    if save_params:
        logger.save_params(params, it)
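# The update above has a closed form once sigma and the gradient estimate are
# known. The sketch below is illustrative only (made-up values, not part of the
# algorithm); it reproduces the same formulas for a single coordinate update so
# the penalty, step size and improvement guarantee can be inspected in
# isolation. It assumes torch and math are imported at module level, as the
# function above already requires.
def _adabatch_step_sketch(sigma=0.5, avol=2., rmax=1., phimax=1., gamma=0.99):
    theta_grad = torch.tensor([0.3, -1.2, 0.05])    # stand-in gradient estimate
    penalty = rmax * phimax**2 / (1 - gamma)**2 * \
        (avol / (sigma * math.sqrt(2 * math.pi)) + gamma / (2 * (1 - gamma)))
    norminf = torch.max(torch.abs(theta_grad))      # only the largest coordinate is updated
    k = torch.argmax(torch.abs(theta_grad))
    alpha_star = sigma**2 / (2 * penalty)           # step size of maximal guaranteed improvement
    Cmax = (alpha_star * norminf**2 / 2).item()     # largest guaranteed improvement
    return k.item(), alpha_star, Cmax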
def main(seed=None, alpha=0.05, logsigma=-2.):
    gamma = 0.99
    env = gym.make('ComplexMiniGolf-v0')
    env.sigma_noise = 0
    env.gamma = gamma
    state_dim = sum(env.observation_space.shape)    # dimensionality of the state space
    action_dim = sum(env.action_space.shape)        # dimensionality of the action space
    print(state_dim, action_dim)

    horizon = 20  # maximum length of a trajectory

    mu_init = torch.tensor([1., 1., 1., 1.])
    log_std_init = torch.tensor([logsigma])
    policy = RadialBasisPolicy(state_dim,              # input size
                               action_dim,             # output size
                               mu_init=mu_init,        # initial mean parameters
                               feature_fun=feature_function,
                               logstd_init=log_std_init,
                               learn_std=True)

    stepper = ConstantStepper(alpha)
    batchsize = 500

    log_dir = '../../../logs'
    log_name = 'REINFORCE'
    logger = Logger(directory=log_dir, name=log_name)

    if seed is None:
        seed = 42
    env.seed(seed)

    # Reset the policy (in case main is run multiple times)
    init_par = [log_std_init, mu_init]
    init_ten = torch.cat(init_par, 0)
    policy.set_from_flat(init_ten)

    stats, performance = reinforce2(alpha,
                                    logsigma,
                                    env=env,
                                    policy=policy,
                                    horizon=horizon,
                                    stepper=stepper,
                                    batchsize=batchsize,
                                    disc=gamma,
                                    iterations=700,
                                    seed=seed,
                                    logger=logger,
                                    save_params=5,       # policy parameters are saved to disk every 5 iterations
                                    shallow=True,        # use optimized code for shallow policies
                                    estimator='gpomdp',  # use the G(PO)MDP refined estimator
                                    baseline='peters')   # use Peters' variance-minimizing baseline
    policy.get_flat()
    return stats, performance
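# Minimal entry point for the experiment above (hedged sketch: the original
# script may launch main() differently, e.g. sweeping over seeds or step sizes).
if __name__ == '__main__':
    stats, cumulative_perf = main(seed=42, alpha=0.05, logsigma=-2.)
    print('Cumulative discounted performance:', cumulative_perf)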
def sepg(env, policy, horizon, batchsize = 100, iterations = 1000, gamma = 0.99, rmax = 1., phimax = 1., safety_requirement = 'mi', delta = 1., confidence_schedule = None, clip_at = 100, test_batchsize = False, render = False, seed = None, baseline = 'peters', shallow = True, action_filter = None, parallel = False, logger = Logger(name='SEPG'), save_params = 1000, log_params = True, verbose = True): """ Only for SIMPLE Gaussian policy w/ scalar variance Policy must have learn_std = False, as std is META-learned """ #Defaults assert policy.learn_std if action_filter is None: action_filter = clip(env) #Seed agent if seed is not None: seed_all_agent(seed) #Prepare logger algo_info = {'Algorithm': 'SEPG', 'Environment': str(env), 'BatchSize': batchsize, 'Max horizon': horizon, 'Iterations': iterations, 'gamma': gamma, 'actionFilter': action_filter, 'rmax': rmax, 'phimax': phimax} logger.write_info({**algo_info, **policy.info()}) log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'Alpha', 'BatchSize', 'Exploration', 'Eta', 'ThetaGradNorm', 'OmegaGrad', 'OmegaMetagrad', 'Penalty', 'MetaPenalty', 'IterationKind', 'ThetaGradNorm', 'Eps', 'Up', 'Down', 'C', 'Cmax', 'Delta'] #0: theta, 1: omega if log_params: log_keys += ['param%d' % i for i in range(policy.num_params())] if test_batchsize: log_keys.append('DetPerf') log_row = dict.fromkeys(log_keys) logger.open(log_row.keys()) #Safety requirements if safety_requirement == 'mi': thresholder = MonotonicImprovement() elif safety_requirement == 'budget': batch = generate_batch(env, policy, horizon, batchsize, action_filter) thresholder = Budget(performance(batch, gamma)) else: thresholder = FixedThreshold(float(safety_requirement)) #Learning loop omega_grad = float('nan') omega_metagrad = float('nan') metapenalty = float('nan') eta = float('nan') it = 0 while(it < iterations): #Begin iteration if verbose: print('\nIteration ', it) if verbose: print('Params: ', policy.get_flat()) #Test mean parameters on deterministic policy if test_batchsize: test_batch = generate_batch(env, policy, horizon, test_batchsize, action_filter=action_filter, seed=seed, njobs=parallel, deterministic=True) log_row['DetPerf'] = performance(test_batch, gamma) #Render behavior if render: generate_batch(env, policy, horizon, 1, action_filter, render=True) # if it % 2 == 0: #Std update omega = policy.get_scale_params() sigma = torch.exp(omega).item() batch = generate_batch(env, policy, horizon, batchsize, action_filter=action_filter, njobs=parallel, seed=seed) if confidence_schedule is not None: delta = confidence_schedule.next(it) log_row['Delta'] = delta if delta <1: grad, grad_var = simple_gpomdp_estimator(batch, gamma, policy, baseline, result='moments') omega_grad = grad[0] omega_grad_var = grad_var[0] omega_metagrad, omega_metagrad_var = metagrad(batch, gamma, policy, alpha, clip_at, baseline, result='moments') quant = 2 * sts.t.interval(1 - delta, batchsize-1,loc=0.,scale=1.)[1] eps = torch.tensor(quant * torch.sqrt(omega_grad_var / batchsize), dtype=torch.float) log_row['Eps'] = torch.norm(eps).item() metaeps = torch.tensor(quant * torch.sqrt(omega_metagrad_var / batchsize), dtype=torch.float) if torch.sign(omega_grad).item() >= 0 and torch.sign(omega_metagrad).item() >= 0: up = torch.clamp(torch.abs(omega_grad - eps), min=0.) * torch.clamp(torch.abs(omega_metagrad - metaeps), min=0.) 
elif torch.sign(omega_grad).item() >= 0 and torch.sign(omega_metagrad).item() < 0: up = (omega_grad + eps) * (omega_metagrad - metaeps) elif torch.sign(omega_grad).item() < 0 and torch.sign(omega_metagrad).item() >=0: up = (omega_grad - eps) * (omega_metagrad + eps) else: up = torch.abs(omega_grad + eps) * torch.abs(omega_metagrad + metaeps) down = omega_metagrad + metaeps * torch.sign(omega_metagrad) log_row['Up'] = up.item() log_row['Down'] = down.item() metapenalty = rmax / (1 - gamma)**2 * (0.53 * avol / (2 * sigma) + gamma / (1 - gamma)) eta_star = (up / (2 * metapenalty * down**2 + 1e-12)).item() Cmax = up**2 / (4 * metapenalty * down**2).item() else: log_row['Eps'] = 0 grad = gpomdp_estimator(batch, gamma, policy, baselinekind=baseline, shallow=shallow) theta_grad = grad[1:] omega_grad = grad[0] #-> mixed, _ = mixed_estimator(batch, gamma, policy, baseline, theta_grad) norm_grad = 2 * theta_grad.dot(mixed) A = omega_grad B = 2 * alpha * torch.norm(theta_grad)**2 C = sigma * alpha * norm_grad C = torch.clamp(C, min=-clip_at, max=clip_at) omega_metagrad = A + B + C metapenalty = rmax / (1 - gamma)**2 * (0.53 * avol / (2 * sigma) + gamma / (1 - gamma)) eta_star = (omega_grad / (2 * metapenalty * omega_metagrad) + 1e-12).item() Cmax = (omega_grad ** 2 / (4 * metapenalty)).item() log_row['Up'] = torch.tensor(omega_grad).item() log_row['Down'] = torch.tensor(omega_metagrad).item() perf = performance(batch, gamma) Co = thresholder.next(perf) Co = min(Co, Cmax) log_row['C'] = Co log_row['Cmax'] = Cmax eta = eta_star + abs(eta_star) * math.sqrt(1 - Co / (Cmax + 1e-12) + 1e-12) new_omega = omega + eta * omega_metagrad policy.set_scale_params(new_omega) ### else: #Mean update omega = policy.get_scale_params() sigma = torch.exp(omega).item() batch = generate_batch(env, policy, horizon, batchsize, action_filter=action_filter, n_jobs=parallel, seed=seed) if confidence_schedule is not None: delta = confidence_schedule.next(it) log_row['Delta'] = delta if delta < 1: grad, grad_var = simple_gpomdp_estimator(batch, gamma, policy, baseline, result='moments') theta_grad = grad[1:] theta_grad_var = grad_var[1:] quant = 2*sts.t.interval(1 - delta, batchsize-1,loc=0.,scale=1.)[1] eps = quant * torch.sqrt(theta_grad_var / batchsize) log_row['Eps'] = torch.norm(eps).item() norm2 = torch.norm(torch.clamp(torch.abs(theta_grad) - eps, min=0.)) norm1 = torch.sum(torch.abs(theta_grad) + eps) log_row['Up'] = norm1.item() log_row['Down'] = norm2.item() else: log_row['Eps'] = 0 grad = simple_gpomdp_estimator(batch, gamma, policy, baseline) theta_grad = grad[1:] norm2 = torch.norm(theta_grad) norm1 = torch.sum(torch.abs(theta_grad)) log_row['Up'] = norm1.item() log_row['Down'] = norm2.item() penalty = rmax * phimax**2 / (1-gamma)**2 * (avol / (sigma * math.sqrt(2*math.pi)) + gamma / (2*(1-gamma))) alpha_star = sigma ** 2 * norm2 ** 2 / (2 * penalty * norm1 ** 2 + 1e-12) Cmax = (alpha_star * norm2**2 / 2).item() perf = performance(batch, gamma) Co = thresholder.next(perf) Co = min(Co, Cmax) log_row['C'] = Co log_row['Cmax'] = Cmax alpha = alpha_star * (1 + math.sqrt(1 - Co / (Cmax + 1e-12) + 1e-12)) theta = policy.get_loc_params() new_theta = theta + alpha * theta_grad policy.set_loc_params(new_theta) ### # Log log_row['IterationKind'] = it % 2 log_row['ThetaGradNorm'] = torch.norm(theta_grad).item() log_row['Alpha'] = alpha log_row['Eta'] = eta log_row['Penalty'] = penalty log_row['MetaPenalty'] = metapenalty log_row['OmegaGrad'] = torch.tensor(omega_grad).item() log_row['OmegaMetagrad'] = 
torch.tensor(omega_metagrad).item() log_row['ThetaGradNorm'] = torch.norm(theta_grad).item() log_row['BatchSize'] = batchsize log_row['Exploration'] = policy.exploration() log_row['Alpha'] = alpha.item() log_row['Perf'] = perf log_row['UPerf'] = performance(batch, 1.) log_row['AvgHorizon'] = avg_horizon(batch) params = policy.get_flat() if log_params: for i in range(policy.num_params()): log_row['param%d' % i] = params[i].item() logger.write_row(log_row, it) if save_params and it % save_params == 0: logger.save_params(params, it) # Next iteration it += 1 # Final policy if save_params: logger.save_params(params, it)
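# Numeric sketch (illustrative values, not part of the algorithm above) of the
# meta step size used in the variance-update branch when delta >= 1: eta_star
# and the corresponding maximal guaranteed improvement Cmax, given the scale
# gradient and metagradient. The small constant guarding the denominator is an
# assumption added here for numerical safety.
def _sepg_eta_sketch(omega_grad=0.4, omega_metagrad=0.8, sigma=0.5,
                     avol=2., rmax=1., gamma=0.99):
    metapenalty = rmax / (1 - gamma)**2 * \
        (0.53 * avol / (2 * sigma) + gamma / (1 - gamma))
    eta_star = omega_grad / (2 * metapenalty * omega_metagrad + 1e-12)
    Cmax = omega_grad**2 / (4 * metapenalty)
    return eta_star, Cmax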
def adastep(env, policy, horizon, batchsize=100, iterations=1000, gamma=0.99,
            rmax=1., phimax=1., greedy=True, delta=1., test_det=True,
            render=False, seed=None, baseline='peters', action_filter=None,
            parallel=False, n_jobs=4, logger=Logger(name='test_sunday'),
            save_params=1000, log_params=True, verbose=True):
    """
    Only for SIMPLE Gaussian policy w/ scalar variance.
    The policy must expose scale parameters (learn_std=True), but only the
    location (mean) parameters are updated here.
    """
    # Defaults
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)

    # Seeding agent
    if seed is not None:
        seed_all_agent(seed)

    # Preparing logger
    algo_info = {'Algorithm': 'ADASTEP',
                 'Environment': str(env),
                 'BatchSize': batchsize,
                 'Max horizon': horizon,
                 'Iterations': iterations,
                 'gamma': gamma,
                 'actionFilter': action_filter,
                 'rmax': rmax,
                 'phimax': phimax,
                 'greedy': greedy}
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'Alpha', 'BatchSize',
                'Exploration', 'ThetaGradNorm', 'Penalty']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_det:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    # Learning
    avol = torch.tensor(env.action_space.high - env.action_space.low).item()
    it = 0
    while it < iterations:
        # Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())

        # Test the deterministic (mean) policy
        if test_det:
            omega = policy.get_scale_params()
            policy.set_scale_params(-100.)
            batch = generate_batch(env, policy, horizon, 1, action_filter)
            policy.set_scale_params(omega)
            log_row['DetPerf'] = performance(batch, gamma)
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        # Collect trajectories and estimate the policy gradient
        omega = policy.get_scale_params()
        sigma = torch.exp(omega).item()
        batch = generate_batch(env, policy, horizon, batchsize, action_filter,
                               parallel=parallel, n_jobs=n_jobs, seed=seed)
        if delta < 1:
            grad, grad_var = simple_gpomdp_estimator(batch, gamma, policy,
                                                     baseline, result='moments')
            theta_grad = grad[1:]
            theta_grad_var = grad_var[1:]
            quant = 2 * sts.t.interval(1 - delta, batchsize - 1, loc=0., scale=1.)[1]
            eps = quant * torch.sqrt(theta_grad_var / batchsize + 1e-12)
            norm2 = torch.norm(torch.clamp(torch.abs(theta_grad) - eps, min=0.))
            norm1 = torch.sum(torch.abs(theta_grad) + eps)
        else:
            grad = simple_gpomdp_estimator(batch, gamma, policy, baseline)
            theta_grad = grad[1:]
            norm2 = torch.norm(theta_grad)
            norm1 = torch.sum(torch.abs(theta_grad))

        # Safe step size
        penalty = rmax * phimax**2 / (1 - gamma)**2 * \
            (avol / (sigma * math.sqrt(2 * math.pi)) + gamma / (2 * (1 - gamma)))
        alpha_star = sigma**2 * norm2**2 / (2 * penalty * norm1**2 + 1e-12)
        Cmax = alpha_star * norm2**2 / 2
        if greedy:
            C = Cmax
        else:
            C = 0
        alpha = alpha_star * (1 + math.sqrt(1 - C / (Cmax + 1e-12) + 1e-12))
        theta = policy.get_loc_params()
        new_theta = theta + alpha * theta_grad
        policy.set_loc_params(new_theta)

        # Log
        log_row['Alpha'] = alpha.item()
        log_row['Penalty'] = penalty
        log_row['ThetaGradNorm'] = torch.norm(theta_grad).item()
        log_row['BatchSize'] = batchsize
        log_row['Exploration'] = policy.exploration()
        log_row['Perf'] = performance(batch, gamma)
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        # Next iteration
        it += 1

    # Final policy
    if save_params:
        logger.save_params(params, it)
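# Numeric illustration (not part of the algorithm above) of the confidence
# interval used when delta < 1: each gradient coordinate is shrunk towards zero
# by eps for the pessimistic 2-norm and inflated by eps for the optimistic
# 1-norm. The values are made up; it assumes scipy.stats is imported as sts and
# torch at module level, as the code above requires.
def _adastep_bounds_sketch(delta=0.2, batchsize=100):
    theta_grad = torch.tensor([0.8, -0.1, 0.3])       # stand-in gradient estimate
    theta_grad_var = torch.tensor([0.5, 0.5, 0.5])    # stand-in per-coordinate variance
    quant = 2 * sts.t.interval(1 - delta, batchsize - 1, loc=0., scale=1.)[1]
    eps = quant * torch.sqrt(theta_grad_var / batchsize + 1e-12)
    norm2 = torch.norm(torch.clamp(torch.abs(theta_grad) - eps, min=0.))  # lower bound
    norm1 = torch.sum(torch.abs(theta_grad) + eps)                        # upper bound
    return norm2.item(), norm1.item()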
def reinforce2(alpha, logsig, env, policy, horizon, *, batchsize=100, iterations=1000, disc=0.99, stepper=ConstantStepper(1e-2), action_filter=None, estimator='gpomdp', baseline='avg', logger=Logger(name='gpomdp'), shallow=False, seed=None, test_batchsize=False, info_key='danger', save_params=100, log_params=False, log_grad=False, parallel=False, render=False, verbose=1): """ REINFORCE/G(PO)MDP algorithmn env: environment policy: the one to improve horizon: maximum task horizon batchsize: number of trajectories used to estimate policy gradient iterations: number of policy updates disc: discount factor stepper: step size criterion. A constant step size is used by default action_filter: function to apply to the agent's action before feeding it to the environment, not considered in gradient estimation. By default, the action is clipped to satisfy evironmental boundaries estimator: either 'reinforce' or 'gpomdp' (default). The latter typically suffers from less variance baseline: control variate to be used in the gradient estimator. Either 'avg' (average reward, default), 'peters' (variance-minimizing) or 'zero' (no baseline) logger: for human-readable logs (standard output, csv, tensorboard...) shallow: whether to employ pre-computed score functions (only available for shallow policies) seed: random seed (None for random behavior) test_batchsize: number of test trajectories used to evaluate the corresponding deterministic policy at each iteration. If 0 or False, no test is performed save_params: how often (every x iterations) to save the policy parameters to disk. Final parameters are always saved for x>0. If False, they are never saved. log_params: whether to include policy parameters in the human-readable logs log_grad: whether to include gradients in the human-readable logs parallel: number of parallel jobs for simulation. If 0 or False, sequential simulation is performed. render: how often (every x iterations) to render the agent's behavior on a sample trajectory. 
If False, no rendering happens verbose: level of verbosity (0: only logs; 1: normal; 2: maximum) """ # Defaults if action_filter is None: action_filter = clip(env) # Seed agent if seed is not None: seed_all_agent(seed) # Prepare logger algo_info = {'Algorithm': 'REINFORCE', 'Estimator': estimator, 'Baseline': baseline, 'Env': str(env), 'Horizon': horizon, 'BatchSize': batchsize, 'Disc': disc, 'StepSizeCriterion': str(stepper), 'Seed': seed, } logger.write_info({**algo_info, **policy.info()}) log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time', 'StepSize', 'Exploration', 'Info'] if log_params: log_keys += ['param%d' % i for i in range(policy.num_params())] if log_grad: log_keys += ['grad%d' % i for i in range(policy.num_params())] if test_batchsize: log_keys += ['TestPerf', 'TestPerf', 'TestInfo'] log_row = dict.fromkeys(log_keys) logger.open(log_row.keys()) # init image & csv filename = "../csv/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/data{}.csv".format(alpha, logsig, seed) os.makedirs(os.path.dirname(filename), exist_ok=True) data_file = open(filename, mode='w') file_writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) visualizer = MGVisualizer("MG visualizer", "/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/test{}.png".format(alpha, logsig, seed)) visualizer.clean_panels() # PLOTTER INFO stats = {} stats['w1'] = [] stats['w2'] = [] stats['w3'] = [] stats['w4'] = [] stats['j'] = [] stats['fail'] = [] # ------------ # Learning loop it = 0 cumulative_fail = 0 cumulative_j = 0 while it < iterations: # Begin iteration start = time.time() if verbose: print('\nIteration ', it) params = policy.get_flat() if verbose > 1: print('Parameters:', params) # Test the corresponding deterministic policy if test_batchsize: test_batch = generate_batch(env, policy, horizon, test_batchsize, action_filter=action_filter, seed=seed, njobs=parallel, deterministic=True, key=info_key) log_row['TestPerf'] = performance(test_batch, disc) log_row['TestInfo'] = mean_sum_info(test_batch).item() log_row['UTestPerf'] = performance(test_batch, 1) # Render the agent's behavior if render and it % render == 0: generate_batch(env, policy, horizon, episodes=1, action_filter=action_filter, render=True, key=info_key) # Collect trajectories batch = generate_batch(env, policy, horizon, batchsize, action_filter=action_filter, seed=seed, n_jobs=parallel, key=info_key) # ------------------- count fails ------------------- rewards = [b[2] for b in batch] failures = [np.count_nonzero(r==-100) for r in rewards] cumulative_fail += sum(failures) # --------------------------------------------------- perf = performance(batch, disc) cumulative_j += perf log_row['Perf'] = perf log_row['Info'] = mean_sum_info(batch).item() log_row['UPerf'] = performance(batch, disc=1.) 
log_row['AvgHorizon'] = avg_horizon(batch) log_row['Exploration'] = policy.exploration().item() log_row['IterationFails'] = sum(failures) log_row['CumulativeFails'] = cumulative_fail # Estimate policy gradient if estimator == 'gpomdp': grad = gpomdp_estimator(batch, disc, policy, baselinekind=baseline, shallow=shallow) elif estimator == 'reinforce': grad = reinforce_estimator(batch, disc, policy, baselinekind=baseline, shallow=shallow) else: raise ValueError('Invalid policy gradient estimator') if verbose > 1: print('Gradients: ', grad) log_row['GradNorm'] = torch.norm(grad).item() # Select meta-parameters stepsize = stepper.next(grad) log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item() # Update policy parameters new_params = params + stepsize * grad policy.set_from_flat(new_params) # Log log_row['Time'] = time.time() - start if log_params: for i in range(policy.num_params()): log_row['param%d' % i] = params[i].item() if log_grad: for i in range(policy.num_params()): log_row['grad%d' % i] = grad[i].item() logger.write_row(log_row, it) # Save parameters if save_params and it % save_params == 0: logger.save_params(params, it) print(new_params) params = new_params.numpy()[1:] # updated w # update csv & image visualizer.show_values(params, perf, cumulative_fail) file_writer.writerow([params[0], params[1], params[2], params[3], cumulative_fail, perf]) # PLOTTER INFO # if it % 10 == 0: stats['w1'].append(params[0]) stats['w2'].append(params[1]) stats['w3'].append(params[2]) stats['w4'].append(params[3]) stats['j'].append(perf) stats['fail'].append(cumulative_fail) # ------------ # Next iteration it += 1 # Save final parameters if save_params: logger.save_params(params, it) visualizer.save_image() # Cleanup logger.close() return stats, cumulative_j
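# The stats dictionary returned by reinforce2 can be inspected directly; a
# hedged sketch of how it might be plotted (matplotlib is assumed available,
# it is not used elsewhere in this file, and the output filename is arbitrary).
def _plot_reinforce2_stats(stats, out_file='reinforce2_weights.png'):
    import matplotlib.pyplot as plt
    for key in ('w1', 'w2', 'w3', 'w4'):
        plt.plot(stats[key], label=key)   # evolution of the four policy weights
    plt.xlabel('iteration')
    plt.legend()
    plt.savefig(out_file)
    plt.close()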
mu_init = torch.zeros(m)
logstd_init = torch.log(torch.zeros(1) + args.std_init)
policy = ShallowGaussianPolicy(m, d,
                               mu_init=mu_init,
                               logstd_init=logstd_init,
                               learn_std=args.learnstd)

test_batchsize = args.min_batchsize if args.test else 0

envname = re.sub(r'[^a-zA-Z]', "", args.env)[:-1].lower()
logname = envname + '_' + args.name + '_' + str(args.seed)

if args.temp:
    logger = Logger(directory='../temp', name=logname)
else:
    logger = Logger(directory='../logs', name=logname)

#Constants
_, kappa, _ = gauss_smooth_const(args.max_feat, args.std_init)
lip_const = gauss_lip_const(args.max_feat, args.max_rew, args.disc, args.std_init)
if args.estimator == 'reinforce':
    var_bound = reinforce_var_bound(args.max_rew, args.disc, kappa, args.horizon)
elif args.estimator == 'gpomdp':
    var_bound = gpomdp_var_bound(args.max_rew, args.disc, kappa, args.horizon)
else:
    raise NotImplementedError
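# A plausible continuation of this setup (hedged sketch, not from the original
# script): feed the constants computed above into the AdaBatch variant that
# takes pen_coeff and var_bound. It assumes that adabatch has been imported
# from the potion algorithms package, that env was created earlier from
# args.env, and that pen_coeff = lip_const / 2 (the smoothness penalty of a
# quadratic lower bound) is an acceptable choice; the dispatch on args.name is
# hypothetical.
if args.name == 'adabatch':
    adabatch(env, policy, args.horizon,
             pen_coeff=lip_const / 2,
             var_bound=var_bound,
             bound='chebyshev',
             min_batchsize=args.min_batchsize,
             disc=args.disc,
             estimator=args.estimator,
             seed=args.seed,
             test_batchsize=test_batchsize,
             logger=logger)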
def adabatch(env, policy, horizon, pen_coeff, *, bound='chebyshev', var_bound=None, grad_range=None, conf=0.2, min_batchsize=32, max_batchsize=10000, iterations=float('inf'), max_samples=1e6, disc=0.9, action_filter=None, estimator='gpomdp', baseline='peters', logger=Logger(name='AdaBatch'), shallow=True, meta_conf=0.05, seed=None, test_batchsize=False, info_key='danger', save_params=100, log_params=True, log_grad=False, parallel=False, render=False, verbose=1): """ Safe PG algorithm from "Adaptive Batch Size for Safe Policy Gradients", Papini et al., 2017. Only for Gaussian policies. env: environment policy: the one to improve horizon: maximum task horizon pen_coeff: penalty coefficient for policy update bound: statistical inequality used to determine optimal batchsize (chebyshev/student/hoeffding/bernstein) var_bound: upper bound on the variance of the PG estimator. Must not be None if Chebyshev's bound is employed grad_range: theoretical range of gradient estimate. If none, it is estimated from data (in a biased way) conf: probability of failure min_batchsize: minimum number of trajectories to estimate policy gradient max_batchsize: maximum number of trajectories to estimate policy gradient iterations: number of policy updates max_samples: maximum number of total trajectories disc: discount factor action_filter: function to apply to the agent's action before feeding it to the environment, not considered in gradient estimation. By default, the action is clipped to satisfy evironmental boundaries estimator: either 'reinforce' or 'gpomdp' (default). The latter typically suffers from less variance baseline: control variate to be used in the gradient estimator. Either 'avg' (average reward, default), 'peters' (variance-minimizing) or 'zero' (no baseline) logger: for human-readable logs (standard output, csv, tensorboard...) shallow: whether to employ pre-computed score functions (only available for shallow policies) meta_conf: confidence level of safe-update test (for evaluation only) seed: random seed (None for random behavior) test_batchsize: number of test trajectories used to evaluate the corresponding deterministic policy at each iteration. If 0 or False, no test is performed info_key: name of the environment info to log save_params: how often (every x iterations) to save the policy parameters to disk. Final parameters are always saved for x>0. If False, they are never saved. log_params: whether to include policy parameters in the human-readable logs log_grad: whether to include gradients in the human-readable logs parallel: number of parallel jobs for simulation. If 0 or False, sequential simulation is performed. render: how often (every x iterations) to render the agent's behavior on a sample trajectory. 
If False, no rendering happens verbose: level of verbosity """ #Defaults if action_filter is None: action_filter = clip(env) if bound == 'chebyshev' and var_bound is None: raise NotImplementedError empirical_range = (grad_range is None) #Seed agent if seed is not None: seed_all_agent(seed) #Prepare logger algo_info = { 'Algorithm': 'AdaBatch', 'Estimator': estimator, 'Baseline': baseline, 'Env': str(env), 'Horizon': horizon, 'Discount': disc, 'Confidence': conf, 'ConfidenceParam': conf, 'Seed': seed, 'MinBatchSize': min_batchsize, 'MaxBatchSize': max_batchsize, 'PenalizationCoefficient': pen_coeff, 'VarianceBound': var_bound, 'Bound': bound } logger.write_info({**algo_info, **policy.info()}) log_keys = [ 'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time', 'StepSize', 'BatchSize', 'Info', 'TotSamples', 'GradVar', 'GradRange', 'Safety', 'Err', 'GradInfNorm' ] if log_params: log_keys += ['param%d' % i for i in range(policy.num_params())] if log_grad: log_keys += ['grad%d' % i for i in range(policy.num_params())] if test_batchsize: log_keys += ['TestPerf', 'TestPerf', 'TestInfo'] log_row = dict.fromkeys(log_keys) logger.open(log_row.keys()) #Initializations it = 0 tot_samples = 0 safety = 1. optimal_batchsize = min_batchsize _estimator = (reinforce_estimator if estimator == 'reinforce' else gpomdp_estimator) updated = False updates = 0 unsafe_updates = 0 params = policy.get_flat() max_grad = torch.zeros_like(params) - float('inf') min_grad = torch.zeros_like(params) + float('inf') #Learning loop while (it < iterations and tot_samples < max_samples): start = time.time() if verbose: print('\n* Iteration %d *' % it) params = policy.get_flat() #Test the corresponding deterministic policy if test_batchsize: test_batch = generate_batch(env, policy, horizon, episodes=test_batchsize, action_filter=action_filter, n_jobs=parallel, deterministic=True, key=info_key) log_row['TestPerf'] = performance(test_batch, disc) log_row['UTestPerf'] = performance(test_batch, 1) log_row['TestInfo'] = mean_sum_info(test_batch).item() #Render the agent's behavior if render and it % render == 0: generate_batch(env, policy, horizon, episodes=1, action_filter=action_filter, render=True) #Collect trajectories according to previous optimal batch size batch = generate_batch(env, policy, horizon, episodes=max( min_batchsize, min(max_batchsize, optimal_batchsize)), action_filter=action_filter, n_jobs=parallel, key=info_key) batchsize = len(batch) #Estimate policy gradient grad_samples = _estimator(batch, disc, policy, baselinekind=baseline, shallow=shallow, result='samples') grad = torch.mean(grad_samples, 0) grad_infnorm = torch.max(torch.abs(grad)) coordinate = torch.min(torch.argmax(torch.abs(grad))).item() #Compute statistics for estimation error if bound in ['bernstein', 'student']: grad_var = torch.var(grad_samples, 0, unbiased=True) grad_var = torch.max(grad_var).item() log_row['GradVar'] = grad_var else: log_row['GradVar'] = var_bound if bound in ['bernstein', 'hoeffding'] and empirical_range: max_grad = torch.max(grad, max_grad) min_grad = torch.min(min_grad, grad) grad_range = torch.max(max_grad - min_grad).item() if grad_range <= 0: grad_range = torch.max(2 * abs(max_grad)).item() log_row['GradRange'] = grad_range #Compute estimation error if bound == 'chebyshev': eps = math.sqrt(var_bound / conf) elif bound == 'student': quant = sts.t.ppf(1 - conf, batchsize) eps = quant * math.sqrt(grad_var) elif bound == 'hoeffding': eps = grad_range * math.sqrt(math.log(2. 
/ conf) / 2) elif bound == 'bernstein': eps = math.sqrt(2 * grad_var * math.log(3. / conf)) eps2 = 3 * grad_range * math.log(3. / conf) #Compute optimal batch size if bound in ['chebyshev', 'student', 'hoeffding']: optimal_batchsize = math.ceil(((13 + 3 * math.sqrt(17)) * eps**2 / (2 * grad_infnorm**2)).item()) min_safe_batchsize = math.ceil((eps**2 / grad_infnorm**2).item()) else: min_safe_batchsize = math.ceil( ((eps + math.sqrt(eps**2 + 4 * eps2 * grad_infnorm)) / (2 * grad_infnorm))**2) optimal_batchsize = min_safe_batchsize _stepsize = ((grad_infnorm - eps / math.sqrt(optimal_batchsize) - eps2 / optimal_batchsize)**2 / (2 * pen_coeff * (grad_infnorm + eps / math.sqrt(optimal_batchsize) + eps2 / optimal_batchsize)**2)).item() ups = (grad_infnorm**2 * _stepsize * (1 - pen_coeff * _stepsize) / optimal_batchsize) old_ups = -float('inf') while ups > old_ups: optimal_batchsize += 1 old_ups = ups _stepsize = ( (grad_infnorm - eps / math.sqrt(optimal_batchsize) - eps2 / optimal_batchsize)**2 / (2 * pen_coeff * (grad_infnorm + eps / math.sqrt(optimal_batchsize) + eps2 / optimal_batchsize)**2)).item() ups = (grad_infnorm**2 * _stepsize * (1 - pen_coeff * _stepsize) / optimal_batchsize) optimal_batchsize -= 1 if verbose: print('Optimal batch size: %d' % optimal_batchsize) #Update long-term quantities tot_samples += batchsize #Update safety measure if updates == 0: old_rets = returns(batch, disc) elif updated: new_rets = returns(batch, disc) tscore, pval = sts.ttest_ind(old_rets, new_rets) if pval / 2 < meta_conf and tscore > 0: unsafe_updates += 1 if verbose: print('The previous update was unsafe! (p-value = %f)' % (pval / 2)) old_rets = new_rets safety = 1 - unsafe_updates / updates #Log log_row['Err'] = eps log_row['Safety'] = safety log_row['Perf'] = performance(batch, disc) log_row['Info'] = mean_sum_info(batch).item() log_row['UPerf'] = performance(batch, disc=1.) log_row['AvgHorizon'] = avg_horizon(batch) log_row['GradNorm'] = torch.norm(grad).item() log_row['GradInfNorm'] = grad_infnorm.item() log_row['BatchSize'] = batchsize log_row['TotSamples'] = tot_samples if log_params: for i in range(policy.num_params()): log_row['param%d' % i] = params[i].item() if log_grad: for i in range(policy.num_params()): log_row['grad%d' % i] = grad[i].item() #Check if number of samples is sufficient to perform update if grad_infnorm < eps / math.sqrt(batchsize): updated = False if verbose: print('No update, need more samples') #Log log_row['StepSize'] = 0. log_row['Time'] = time.time() - start if verbose: print(separator) logger.write_row(log_row, it) if verbose: print(separator) #Skip to next iteration (current trajectories are discarded) it += 1 continue #Select step size if bound == 'bernstein': stepsize = ((grad_infnorm - eps / math.sqrt(batchsize) - eps2 / batchsize)**2 / (2 * pen_coeff * (grad_infnorm + eps / math.sqrt(batchsize) + eps2 / batchsize)**2)).item() else: stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff) log_row['StepSize'] = stepsize #Update policy parameters new_params = params new_params[coordinate] = (params[coordinate] + stepsize * grad[coordinate]) policy.set_from_flat(new_params) updated = True updates += 1 #Save parameters if save_params and it % save_params == 0: logger.save_params(params, it) #Next iteration log_row['Time'] = time.time() - start if verbose: print(separator) logger.write_row(log_row, it) if verbose: print(separator) it += 1 #Save final parameters if save_params: logger.save_params(params, it) #Cleanup logger.close()
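#Numeric sketch (illustrative values, not from the paper's experiments) of the
#closed-form quantities used above under the Chebyshev bound: the estimation
#error eps, the constant step size, and the optimal and minimum-safe batch
#sizes. It only assumes math is imported at module level, as the code above
#already requires.
def _adabatch_chebyshev_sketch(var_bound=100., conf=0.2, pen_coeff=1.,
                               grad_infnorm=0.5):
    eps = math.sqrt(var_bound / conf)
    stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff)
    optimal_batchsize = math.ceil((13 + 3 * math.sqrt(17)) * eps**2
                                  / (2 * grad_infnorm**2))
    min_safe_batchsize = math.ceil(eps**2 / grad_infnorm**2)
    return eps, stepsize, optimal_batchsize, min_safe_batchsize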
def reinforce(env, policy, horizon, *, batchsize=100, iterations=1000, disc=0.99, stepper=ConstantStepper(1e-2), action_filter=None, estimator='gpomdp', baseline='avg', logger=Logger(name='gpomdp'), shallow=False, seed=None, test_batchsize=False, info_key='danger', save_params=100, log_params=False, log_grad=False, parallel=False, render=False, verbose=1): """ REINFORCE/G(PO)MDP algorithmn env: environment policy: the one to improve horizon: maximum task horizon batchsize: number of trajectories used to estimate policy gradient iterations: number of policy updates disc: discount factor stepper: step size criterion. A constant step size is used by default action_filter: function to apply to the agent's action before feeding it to the environment, not considered in gradient estimation. By default, the action is clipped to satisfy evironmental boundaries estimator: either 'reinforce' or 'gpomdp' (default). The latter typically suffers from less variance baseline: control variate to be used in the gradient estimator. Either 'avg' (average reward, default), 'peters' (variance-minimizing) or 'zero' (no baseline) logger: for human-readable logs (standard output, csv, tensorboard...) shallow: whether to employ pre-computed score functions (only available for shallow policies) seed: random seed (None for random behavior) test_batchsize: number of test trajectories used to evaluate the corresponding deterministic policy at each iteration. If 0 or False, no test is performed save_params: how often (every x iterations) to save the policy parameters to disk. Final parameters are always saved for x>0. If False, they are never saved. log_params: whether to include policy parameters in the human-readable logs log_grad: whether to include gradients in the human-readable logs parallel: number of parallel jobs for simulation. If 0 or False, sequential simulation is performed. render: how often (every x iterations) to render the agent's behavior on a sample trajectory. 
If False, no rendering happens verbose: level of verbosity (0: only logs; 1: normal; 2: maximum) """ #Defaults if action_filter is None: action_filter = clip(env) #Seed agent if seed is not None: seed_all_agent(seed) #Prepare logger algo_info = { 'Algorithm': 'REINFORCE', 'Estimator': estimator, 'Baseline': baseline, 'Env': str(env), 'Horizon': horizon, 'BatchSize': batchsize, 'Disc': disc, 'StepSizeCriterion': str(stepper), 'Seed': seed, } logger.write_info({**algo_info, **policy.info()}) log_keys = [ 'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time', 'StepSize', 'Exploration', 'Info' ] if log_params: log_keys += ['param%d' % i for i in range(policy.num_params())] if log_grad: log_keys += ['grad%d' % i for i in range(policy.num_params())] if test_batchsize: log_keys += ['TestPerf', 'TestPerf', 'TestInfo'] log_row = dict.fromkeys(log_keys) logger.open(log_row.keys()) #Learning loop it = 0 while (it < iterations): #Begin iteration start = time.time() if verbose: print('\nIteration ', it) params = policy.get_flat() if verbose > 1: print('Parameters:', params) #Test the corresponding deterministic policy if test_batchsize: test_batch = generate_batch(env, policy, horizon, test_batchsize, action_filter=action_filter, seed=seed, njobs=parallel, deterministic=True, key=info_key) log_row['TestPerf'] = performance(test_batch, disc) log_row['TestInfo'] = mean_sum_info(test_batch).item() log_row['UTestPerf'] = performance(test_batch, 1) #Render the agent's behavior if render and it % render == 0: generate_batch(env, policy, horizon, episodes=1, action_filter=action_filter, render=True, key=info_key) #Collect trajectories batch = generate_batch(env, policy, horizon, batchsize, action_filter=action_filter, seed=seed, n_jobs=parallel, key=info_key) log_row['Perf'] = performance(batch, disc) log_row['Info'] = mean_sum_info(batch).item() log_row['UPerf'] = performance(batch, disc=1.) log_row['AvgHorizon'] = avg_horizon(batch) log_row['Exploration'] = policy.exploration().item() #Estimate policy gradient if estimator == 'gpomdp': grad = gpomdp_estimator(batch, disc, policy, baselinekind=baseline, shallow=shallow) elif estimator == 'reinforce': grad = reinforce_estimator(batch, disc, policy, baselinekind=baseline, shallow=shallow) else: raise ValueError('Invalid policy gradient estimator') if verbose > 1: print('Gradients: ', grad) log_row['GradNorm'] = torch.norm(grad).item() #Select meta-parameters stepsize = stepper.next(grad) log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item() #Update policy parameters new_params = params + stepsize * grad policy.set_from_flat(new_params) #Log log_row['Time'] = time.time() - start if log_params: for i in range(policy.num_params()): log_row['param%d' % i] = params[i].item() if log_grad: for i in range(policy.num_params()): log_row['grad%d' % i] = grad[i].item() logger.write_row(log_row, it) #Save parameters if save_params and it % save_params == 0: logger.save_params(params, it) #Next iteration it += 1 #Save final parameters if save_params: logger.save_params(params, it) #Cleanup logger.close()
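#A stepper only needs to expose next(grad), as used above. Below is a hedged
#sketch of a custom adaptive stepper (the steppers shipped in
#potion.meta.steppers may have a richer interface; this is an illustration,
#not the library class). Unlike ConstantStepper, it returns a per-coordinate
#step size tensor, which the update new_params = params + stepsize * grad
#handles via broadcasting.
class _RMSPropStepper:
    def __init__(self, alpha=1e-2, decay=0.9, eps=1e-8):
        self.alpha = alpha
        self.decay = decay
        self.eps = eps
        self.sq_avg = None  # running mean of squared gradient coordinates

    def next(self, grad):
        if self.sq_avg is None:
            self.sq_avg = torch.zeros_like(grad)
        self.sq_avg = self.decay * self.sq_avg + (1 - self.decay) * grad**2
        return self.alpha / torch.sqrt(self.sq_avg + self.eps)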
def adastep(env, policy, horizon, pen_coeff, var_bound, *, conf=0.2, batchsize=5000, iterations=float('inf'), max_samples=1e6, disc=0.9, action_filter=None, estimator='gpomdp', baseline='peters', logger=Logger(name='AdaStep'), shallow=True, meta_conf=0.05, seed=None, test_batchsize=False, info_key='danger', save_params=100, log_params=True, log_grad=False, parallel=False, render=False, verbose=1): """ Safe PG algorithm from "Adaptive Step Size for Policy Gradient Methods", Pirotta et al., 2013. Only for Gaussian policies. env: environment policy: the one to improve horizon: maximum task horizon pen_coeff: penalty coefficient for policy update var_bound: upper bound on the variance of the PG estimator conf: probability of failure batchsize: number of trajectories to estimate policy gradient iterations: maximum number of learning iterations max_samples: maximum number of total trajectories disc: discount factor action_filter: function to apply to the agent's action before feeding it to the environment, not considered in gradient estimation. By default, the action is clipped to satisfy evironmental boundaries estimator: either 'reinforce' or 'gpomdp' (default). The latter typically suffers from less variance baseline: control variate to be used in the gradient estimator. Either 'avg' (average reward, default), 'peters' (variance-minimizing) or 'zero' (no baseline) logger: for human-readable logs (standard output, csv, tensorboard...) shallow: whether to employ pre-computed score functions (only available for shallow policies) meta_conf: confidence level of safe-update test (for evaluation only) seed: random seed (None for random behavior) test_batchsize: number of test trajectories used to evaluate the corresponding deterministic policy at each iteration. If 0 or False, no test is performed info_key: name of the environment info to log save_params: how often (every x iterations) to save the policy parameters to disk. Final parameters are always saved for x>0. If False, they are never saved. log_params: whether to include policy parameters in the human-readable logs log_grad: whether to include gradients in the human-readable logs parallel: number of parallel jobs for simulation. If 0 or False, sequential simulation is performed. render: how often (every x iterations) to render the agent's behavior on a sample trajectory. If False, no rendering happens verbose: level of verbosity on standard output """ #Defaults if action_filter is None: action_filter = clip(env) #Seed agent if seed is not None: seed_all_agent(seed) #Prepare logger algo_info = { 'Algorithm': 'AdaStep', 'Estimator': estimator, 'Baseline': baseline, 'Env': str(env), 'Horizon': horizon, 'Discount': disc, 'Confidence': conf, 'ConfidenceParam': conf, 'Seed': seed, 'BatchSize': batchsize, 'PenalizationCoefficient': pen_coeff, 'VarianceBound': var_bound } logger.write_info({**algo_info, **policy.info()}) log_keys = [ 'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time', 'StepSize', 'BatchSize', 'Info', 'TotSamples', 'Safety' ] if log_params: log_keys += ['param%d' % i for i in range(policy.num_params())] if log_grad: log_keys += ['grad%d' % i for i in range(policy.num_params())] if test_batchsize: log_keys += ['TestPerf', 'TestPerf', 'TestInfo'] log_row = dict.fromkeys(log_keys) logger.open(log_row.keys()) #Initializations it = 0 tot_samples = 0 safety = 1. 
_estimator = (reinforce_estimator if estimator == 'reinforce' else gpomdp_estimator) updated = False updates = 0 unsafe_updates = 0 eps = math.sqrt(var_bound / conf) #Learning loop while (it < iterations and tot_samples < max_samples): start = time.time() if verbose: print('\n* Iteration %d *' % it) params = policy.get_flat() #Test the corresponding deterministic policy if test_batchsize: test_batch = generate_batch(env, policy, horizon, episodes=test_batchsize, action_filter=action_filter, n_jobs=parallel, deterministic=True, key=info_key) log_row['TestPerf'] = performance(test_batch, disc) log_row['UTestPerf'] = performance(test_batch, 1) log_row['TestInfo'] = mean_sum_info(test_batch).item() #Render the agent's behavior if render and it % render == 0: generate_batch(env, policy, horizon, episodes=1, action_filter=action_filter, render=True) #Collect trajectories according to fixed batch size batch = generate_batch(env, policy, horizon, episodes=batchsize, action_filter=action_filter, n_jobs=parallel, key=info_key) #Estimate policy gradient grad_samples = _estimator(batch, disc, policy, baselinekind=baseline, shallow=shallow, result='samples') grad = torch.mean(grad_samples, 0) lower = torch.clamp( torch.abs(grad) - eps / math.sqrt(batchsize), 0, float('inf')) upper = torch.abs(grad) + eps / math.sqrt(batchsize) #Update long-term quantities tot_samples += batchsize #Update safety measure if updates == 0: old_rets = returns(batch, disc) elif updated: new_rets = returns(batch, disc) tscore, pval = sts.ttest_ind(old_rets, new_rets) if pval / 2 < meta_conf and tscore > 0: unsafe_updates += 1 if verbose: print('The previous update was unsafe! (p-value = %f)' % (pval / 2)) old_rets = new_rets safety = 1 - unsafe_updates / updates #Log log_row['Safety'] = safety log_row['Perf'] = performance(batch, disc) log_row['Info'] = mean_sum_info(batch).item() log_row['UPerf'] = performance(batch, disc=1.) log_row['AvgHorizon'] = avg_horizon(batch) log_row['GradNorm'] = torch.norm(grad).item() log_row['BatchSize'] = batchsize log_row['TotSamples'] = tot_samples if log_params: for i in range(policy.num_params()): log_row['param%d' % i] = params[i].item() if log_grad: for i in range(policy.num_params()): log_row['grad%d' % i] = grad[i].item() #Check if number of samples is sufficient to perform update if torch.norm(lower) == 0: updated = False if verbose: print('No update, would require more samples') #Select step size stepsize = (torch.norm(lower)**2 / (2 * pen_coeff * torch.sum(upper)**2)).item() log_row['StepSize'] = stepsize #Update policy parameters new_params = params + stepsize * grad policy.set_from_flat(new_params) updated = True updates += 1 #Save parameters if save_params and it % save_params == 0: logger.save_params(params, it) #Next iteration log_row['Time'] = time.time() - start if verbose: print(separator) logger.write_row(log_row, it) if verbose: print(separator) it += 1 #Save final parameters if save_params: logger.save_params(params, it) #Cleanup logger.close()
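#Hedged usage sketch for adastep (not from the original repository). The
#variance bound is obtained from helper functions used elsewhere in this repo
#(gauss_smooth_const, gpomdp_var_bound, gauss_lip_const); it is assumed they
#are importable here, that max_feat/max_rew are valid bounds for the task, and
#that pen_coeff = lip_const / 2 is an acceptable penalty coefficient.
def _adastep_demo(env, policy, max_feat=1., max_rew=1., disc=0.9, horizon=100):
    sigma = torch.exp(policy.get_scale_params()).item()
    _, kappa, _ = gauss_smooth_const(max_feat, sigma)
    var_bound = gpomdp_var_bound(max_rew, disc, kappa, horizon)
    lip_const = gauss_lip_const(max_feat, max_rew, disc, sigma)
    adastep(env, policy, horizon,
            pen_coeff=lip_const / 2,   # assumption, not from the original experiments
            var_bound=var_bound,
            conf=0.2,
            batchsize=5000,
            disc=disc,
            logger=Logger(name='AdaStep_demo'))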
import gym
import torch
import potion.envs
from potion.actors.continuous_policies import ShallowGaussianPolicy
from potion.algorithms.reinforce import reinforce
from potion.common.logger import Logger
from potion.meta.steppers import ConstantStepper

log_dir = '../logs'
log_name = 'REINFORCE'
logger = Logger(directory=log_dir, name=log_name)

env = gym.make('ContCartPole-v0')

state_dim = sum(env.observation_space.shape)  #dimensionality of the state space
action_dim = sum(env.action_space.shape)      #dimensionality of the action space
print(state_dim, action_dim)

horizon = 500  #maximum length of a trajectory
gamma = 1.

policy = ShallowGaussianPolicy(
    state_dim,               #input size
    action_dim,              #output size
    mu_init=torch.zeros(4),  #initial mean parameters
    logstd_init=0.,          #log of standard deviation
    learn_std=False          #We are NOT going to learn the variance parameter
)

# state = torch.ones(4)
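# A plausible way to complete this script (hedged sketch): the stepper, batch
# size and number of iterations below are illustrative choices, not taken from
# the original experiment; reinforce and ConstantStepper are the imports above.
stepper = ConstantStepper(0.05)

reinforce(env=env,
          policy=policy,
          horizon=horizon,
          stepper=stepper,
          batchsize=100,
          disc=gamma,
          iterations=100,
          seed=42,
          logger=logger,
          shallow=True,        # shallow policy: use pre-computed score functions
          estimator='gpomdp',
          baseline='peters')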
def mepg(env, policy, horizon, batchsize=500, iterations=200, disc=0.99,
         alpha=1e-1, eta=1e-3, clip_at=100, test_batchsize=False, render=False,
         seed=None, action_filter=None, parallel=False,
         logger=Logger(name='MEPG'), save_params=50, log_params=True,
         verbose=True):
    """
    MEPG algorithm
    Only for shallow Gaussian policy w/ scalar variance
    """
    #Defaults
    assert type(policy) == ShallowGaussianPolicy
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'MEPG',
        'Environment': str(env),
        'BatchSize': batchsize,
        'Horizon': horizon,
        'Iterations': iterations,
        'Disc': disc,
        'Alpha': alpha,
        'Eta': eta,
        'Seed': seed,
        'ActionFilter': action_filter
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'MetaStepSize', 'BatchSize',
        'Exploration', 'OmegaGrad', 'OmegaMetagrad', 'UpsilonGradNorm'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Learning loop
    it = 0
    while it < iterations:
        #Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())

        #Test mean parameters on deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        njobs=parallel,
                                        deterministic=True)
            log_row['DetPerf'] = performance(test_batch, disc)

        #Render behavior
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        #Set metaparameters
        omega = policy.get_scale_params()
        sigma = torch.exp(omega)
        stepsize = alpha * sigma**2

        #Collect trajectories
        batch = generate_batch(env, policy, horizon, batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel)

        #Estimate policy gradient and metagradient
        grad = gpomdp_estimator(batch, disc, policy,
                                baselinekind='peters',
                                shallow=True)
        upsilon_grad = grad[1:]
        omega_grad = grad[0]
        omega_metagrad = metagrad(batch, disc, policy, alpha, clip_at, grad=grad)

        #Update mean and scale parameters
        upsilon = policy.get_loc_params()
        new_upsilon = upsilon + stepsize * upsilon_grad
        policy.set_loc_params(new_upsilon)
        new_omega = omega + eta * omega_metagrad
        policy.set_scale_params(new_omega)

        # Log
        log_row['Exploration'] = policy.exploration()
        log_row['StepSize'] = stepsize.item()
        log_row['MetaStepSize'] = eta
        log_row['OmegaGrad'] = omega_grad.item()
        log_row['OmegaMetagrad'] = omega_metagrad.item()
        log_row['UpsilonGradNorm'] = torch.norm(upsilon_grad).item()
        log_row['BatchSize'] = batchsize
        log_row['Perf'] = performance(batch, disc)
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        # Next iteration
        it += 1

    # Final policy
    if save_params:
        logger.save_params(params, it)
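# Hedged usage sketch for mepg (illustrative hyperparameters; the environment
# and import paths follow the ContCartPole script in this repo, and the policy
# is the shallow Gaussian with learnable scalar log-std that mepg asserts).
def _mepg_demo(seed=0):
    import gym
    import potion.envs  # registers ContCartPole-v0
    from potion.actors.continuous_policies import ShallowGaussianPolicy

    env = gym.make('ContCartPole-v0')
    d = sum(env.observation_space.shape)
    policy = ShallowGaussianPolicy(d, 1, mu_init=torch.zeros(d),
                                   logstd_init=0., learn_std=True)
    mepg(env, policy, horizon=100,
         batchsize=500,
         iterations=200,
         disc=0.99,
         alpha=1e-1,   # step size coefficient for the mean (scaled by sigma^2 inside)
         eta=1e-3,     # meta step size for the log-std
         seed=seed,
         logger=Logger(name='MEPG_demo'))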
def ssepg(env, policy, horizon, batchsize=100, iterations=200, disc=0.99, pow_alpha=0.01, pow_err_tol=0.1, max_pow_it=100, max_pow_attempts=3, pow_clip=0.2, safety_req=MonotonicImprovement(0.), conf=0.2, adapt_batchsize=False, test_batchsize=False, render=False, seed=None, action_filter=None, parallel=False, logger=Logger(name='SEPG'), save_params=50, log_params=True, verbose=True): """ SSEPG algorithm Only for shallow Gaussian policy w/ scalar variance """ #Defaults assert type(policy) == ShallowGaussianPolicy assert policy.learn_std if action_filter is None: action_filter = clip(env) #Seed agent if seed is not None: seed_all_agent(seed) #Prepare logger algo_info = { 'Algorithm': 'MEPG', 'Environment': str(env), 'BatchSize': batchsize, 'Horizon': horizon, 'Iterations': iterations, 'Disc': disc, 'Seed': seed, 'ActionFilter': action_filter } logger.write_info({**algo_info, **policy.info()}) log_keys = [ 'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'MetaStepSize', 'BatchSize', 'Exploration', 'OmegaGrad', 'OmegaMetagrad', 'UpsilonGradNorm', 'UpsilonGradVar', 'UpsilonEps', 'OmegaGradVar', 'OmegaEps', 'Req', 'MinBatchSize', 'MaxReq', 'UpsilonLip', 'OmegaLip' ] if log_params: log_keys += ['param%d' % i for i in range(policy.num_params())] if test_batchsize: log_keys.append('DetPerf') log_row = dict.fromkeys(log_keys) logger.open(log_row.keys()) #Learning loop it = 0 stepsize = 0. metastepsize = 0. omega_grad_var = 0. omega_eps = 0. omega_metagrad = torch.zeros(1) G = 0. while (it < iterations): #Begin iteration if verbose: print('\nIteration ', it) if verbose: print('Params: ', policy.get_flat()) #Test mean parameters on deterministic policy if test_batchsize: test_batch = generate_batch(env, policy, horizon, test_batchsize, action_filter=action_filter, seed=seed, njobs=parallel, deterministic=True) log_row['DetPerf'] = performance(test_batch, disc) #Render behavior if render: generate_batch(env, policy, horizon, 1, action_filter, render=True) #Set metaparameters omega = policy.get_scale_params() sigma = torch.exp(omega) #Collect trajectories batch = generate_batch(env, policy, horizon, batchsize, action_filter=action_filter, seed=seed, n_jobs=parallel) perf = performance(batch, disc) #Estimate policy gradient grad_samples = gpomdp_estimator(batch, disc, policy, baselinekind='peters', shallow=True, result='samples') grad = torch.mean(grad_samples, 0) upsilon_grad = grad[1:] upsilon_grad_norm = torch.norm(upsilon_grad) omega_grad = grad[0] dfn = upsilon_grad.shape[0] ### Mean-update iteration if it % 2 == 0: #Compute gradient estimation error for mean parameters if conf < 1 and grad_samples.size()[1] > 2: centered = grad_samples[:, 1:] - upsilon_grad.unsqueeze(0) grad_cov = batchsize / (batchsize - 1) * torch.mean( torch.bmm(centered.unsqueeze(2), centered.unsqueeze(1)), 0) upsilon_grad_var = torch.sum(torch.diag(grad_cov)).item() max_eigv = eigsh(grad_cov.numpy(), 1)[0][0] quant = sts.f.ppf(1 - conf, dfn, batchsize - dfn) upsilon_eps = math.sqrt(max_eigv * dfn * quant) elif conf < 1: upsilon_grad_var = torch.var(grad_samples[:, 1]).item() quant = sts.t.ppf(1 - conf / 2, batchsize - 1) upsilon_eps = quant * math.sqrt(upsilon_grad_var) else: upsilon_eps = 0. upsilon_grad_var = 0. #Compute safe step size for mean parameters mask = torch.ones_like(grad) mask[0] = 0. 
F = power(policy, batch, grad, disc, pow_alpha=pow_alpha, err_tol=pow_err_tol, max_it=max_pow_it, max_attempts=max_pow_attempts, shallow=True, clip=pow_clip, verbose=verbose, mask=mask) req = safety_req.next(perf) max_req = (upsilon_grad_norm - upsilon_eps / math.sqrt((batchsize - dfn)))**2 / \ (2 * F) req = min(req, max_req) alpha = (upsilon_grad_norm - upsilon_eps / math.sqrt((batchsize - dfn))) / \ F * \ (1 + math.sqrt(1 - req / max_req)) stepsize = (alpha / upsilon_grad_norm).item() #Ensure minimum safe batchsize min_batchsize = math.ceil( (upsilon_eps**2 / upsilon_grad_norm**2).item() + 1e-12) + dfn if conf < 1 and adapt_batchsize: batchsize = max(batchsize, min_batchsize) #Update mean parameters upsilon = policy.get_loc_params() new_upsilon = upsilon + alpha * sigma**2 * upsilon_grad / upsilon_grad_norm policy.set_loc_params(new_upsilon) ### ### Variance-update iteration else: #Estimate meta gradient (alpha from previous step) omega_metagrad = metagrad(batch, disc, policy, alpha, grad_samples=grad_samples) omega_metagrad_norm = torch.norm(omega_metagrad) #Compute gradient estimation error for variance parameter if conf < 1: omega_grad_var = torch.var(grad_samples[:, 0]).item() quant = sts.t.ppf(1 - conf / 2, batchsize - 1) omega_eps = quant * math.sqrt(omega_grad_var / batchsize) else: omega_grad_var = 0. omega_eps = 0. #Compute safe meta step size mask = torch.zeros_like(grad) mask[0] = 1. G = power(policy, batch, grad, disc, pow_alpha=pow_alpha, err_tol=pow_err_tol, max_it=max_pow_it, max_attempts=max_pow_attempts, shallow=True, clip=pow_clip, verbose=verbose, mask=mask) req = safety_req.next(perf) proj = omega_grad.view(-1).dot( omega_metagrad.view(-1)) / torch.norm(omega_metagrad) max_req = (torch.abs(proj) - omega_eps / math.sqrt(batchsize))**2 / (2 * G) req = min(req, max_req) eta = (torch.abs(proj) - omega_eps / math.sqrt(batchsize)) / \ G * \ (torch.sign(proj) + torch.sqrt(1 - req / max_req)) metastepsize = (eta / omega_metagrad_norm).item() #Ensure minimum safe batchsize min_batchsize = math.ceil((omega_eps**2 / proj**2).item() + 1e-12) if conf < 1 and adapt_batchsize: batchsize = max(batchsize, min_batchsize) #Update variance parameters new_omega = omega + eta * omega_metagrad / omega_metagrad_norm policy.set_scale_params(new_omega) # Log log_row['UpsilonLip'] = F log_row['OmegaLip'] = G log_row['Exploration'] = sigma.item() log_row['StepSize'] = stepsize log_row['MetaStepSize'] = metastepsize log_row['OmegaGrad'] = omega_grad.item() log_row['OmegaMetagrad'] = omega_metagrad.item() log_row['UpsilonGradNorm'] = torch.norm(upsilon_grad).item() log_row['BatchSize'] = batchsize log_row['Perf'] = perf log_row['UPerf'] = performance(batch, 1.) log_row['AvgHorizon'] = avg_horizon(batch) log_row['UpsilonGradVar'] = upsilon_grad_var log_row['UpsilonEps'] = upsilon_eps log_row['OmegaGradVar'] = upsilon_grad_var log_row['OmegaEps'] = upsilon_eps log_row['Req'] = req print(min_batchsize) log_row['MinBatchSize'] = min_batchsize log_row['MaxReq'] = max_req.item() params = policy.get_flat() if log_params: for i in range(policy.num_params()): log_row['param%d' % i] = params[i].item() logger.write_row(log_row, it) if save_params and it % save_params == 0: logger.save_params(params, it) # Next iteration it += 1 # Final policy if save_params: logger.save_params(params, it)
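# Numeric sketch (made-up values, illustration only) of the safe step size used
# in the mean-update branch above: F is the curvature estimate from the power
# method, eps the gradient estimation error, and req the required performance
# improvement.
def _ssepg_alpha_sketch(grad_norm=1.0, eps=0.2, F=5.0, batchsize=100, dfn=4,
                        req=0.0):
    lower = grad_norm - eps / math.sqrt(batchsize - dfn)   # pessimistic gradient norm
    max_req = lower**2 / (2 * F)                           # largest guaranteed improvement
    req = min(req, max_req)
    alpha = lower / F * (1 + math.sqrt(1 - req / max_req))
    return alpha, max_req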
def semisafepg(env, policy, horizon, *, conf = 0.05, min_batchsize = 32, max_batchsize = 5000, iterations = float('inf'), max_samples = 1e6, disc = 0.9, forget = 0.1, action_filter = None, estimator = 'gpomdp', baseline = 'peters', logger = Logger(name='SSPG'), shallow = True, pow_step = 0.01, pow_decay = 0.99, pow_it = 100, pow_tol = 0.05, pow_clip = 0.1, fast = False, meta_conf = 0.05, seed = None, test_batchsize = False, info_key = 'danger', save_params = 100, log_params = True, log_grad = False, parallel = False, render = False, verbose = 1): """ Semi-safe PG algorithm from "Smoothing Policies and Safe Policy Gradients, Papini et al., 2019 env: environment policy: the one to improve horizon: maximum task horizon conf: probability of unsafety (per update) min_batchsize: minimum number of trajectories used to estimate policy gradient max_batchsize: maximum number of trajectories used to estimate policy gradient iterations: maximum number of learning iterations max_samples: maximum number of total trajectories disc: discount factor forget: decay of the (estimated) global gradient Lipscthiz constant action_filter: function to apply to the agent's action before feeding it to the environment, not considered in gradient estimation. By default, the action is clipped to satisfy evironmental boundaries estimator: either 'reinforce' or 'gpomdp' (default). The latter typically suffers from less variance baseline: control variate to be used in the gradient estimator. Either 'avg' (average reward, default), 'peters' (variance-minimizing) or 'zero' (no baseline) logger: for human-readable logs (standard output, csv, tensorboard) shallow: whether to employ pre-computed score functions (only available for shallow policies) pow_step: step size of the power method pow_decay: initial decay parameter of the power method pow_it: maximum number of iterations (per epoch) of the power method pow_tol: relative-error tolerance of the power method pow_clip: importance-weight clipping parameter for the power method (default 0.2) fast: whether to pursue maximum convergence speed (under safety constraints) meta_conf: confidence level of safe update test (for evaluation) seed: random seed (None for random behavior) test_batchsize: number of test trajectories used to evaluate the corresponding deterministic policy at each iteration. If False, no test is performed info_key: name of the environment info to log save_params: how often (every x iterations) to save the policy parameters to disk. Final parameters are always saved for x>0. If False, they are never saved. log_params: whether to include policy parameters in the human-readable logs log_grad: whether to include gradients in the human-readable logs parallel: number of parallel jobs for simulation. If False, sequential simulation is performed. render: how often (every x iterations) to render the agent's behavior on a sample trajectory. 
        If False, no rendering happens
    verbose: level of verbosity

    A minimal, illustrative usage sketch follows this function.
    """
    # Defaults
    if action_filter is None:
        action_filter = clip(env)

    # Seed agent
    if seed is not None:
        seed_all_agent(seed)

    # Prepare logger
    algo_info = {
        'Algorithm': 'SSPG',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'Seed': seed,
        'MinBatchSize': min_batchsize,
        'MaxBatchSize': max_batchsize,
        'ForgetParam': forget,
        'PowerStep': pow_step,
        'PowerDecay': pow_decay,
        'PowerIters': pow_it,
        'PowerTolerance': pow_tol,
        'Fast': fast
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
                'BatchSize', 'LipConst', 'ErrBound', 'SampleVar', 'Info',
                'TotSamples', 'Safety', 'UScore']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    # Initializations
    it = 0
    updated = False
    updates = 0
    unsafe_updates = 0
    safety = 1.
    tot_samples = 0
    optimal_batchsize = min_batchsize
    min_safe_batchsize = min_batchsize
    _conf = conf
    _estimator = (reinforce_estimator if estimator == 'reinforce'
                  else gpomdp_estimator)
    old_lip_const = 0.
    dfn = policy.get_flat().shape[0]
    min_batchsize = max(min_batchsize, dfn + 1)

    # Learning loop
    while it < iterations and tot_samples < max_samples:
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        # Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        # Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon, episodes=1,
                           action_filter=action_filter, render=True)

        # Collect trajectories according to the target batch size
        target_batchsize = min_safe_batchsize if fast else optimal_batchsize
        batch = generate_batch(env, policy, horizon,
                               episodes=max(min_batchsize,
                                            min(max_batchsize,
                                                target_batchsize)),
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)
        batchsize = len(batch)

        # Collect more trajectories to match the minimum safe batch size
        do = True
        while do or batchsize < min_safe_batchsize:
            do = False
            batch += generate_batch(env, policy, horizon,
                                    episodes=(min(max_batchsize,
                                                  min_safe_batchsize)
                                              - batchsize),
                                    action_filter=action_filter,
                                    n_jobs=parallel,
                                    key=info_key)
            batchsize = len(batch)

            # Estimate policy gradient
            grad_samples = _estimator(batch, disc, policy,
                                      baselinekind=baseline,
                                      shallow=shallow,
                                      result='samples')
            grad = torch.mean(grad_samples, 0)

            # Compute estimation error with an ellipsoidal confidence region
            centered = grad_samples - grad.unsqueeze(0)
            grad_cov = (batchsize / (batchsize - 1)
                        * torch.mean(torch.bmm(centered.unsqueeze(2),
                                               centered.unsqueeze(1)), 0))
            grad_var = torch.sum(torch.diag(grad_cov)).item()  # for humans
            max_eigv = eigsh(grad_cov.numpy(), 1)[0][0]
            quant = sts.f.ppf(1 - _conf, dfn, batchsize - dfn)
            eps = math.sqrt(max_eigv * dfn * quant)

            # Optimal batch size
            optimal_batchsize = torch.ceil(
                4 * eps**2 / torch.norm(grad)**2 + dfn).item()
            min_safe_batchsize = torch.ceil(
                eps**2 / torch.norm(grad)**2 + dfn).item()
            target_batchsize = (min_safe_batchsize if fast
                                else optimal_batchsize)
            if verbose and optimal_batchsize < max_batchsize:
                print('Collected %d / %d trajectories'
                      % (batchsize, target_batchsize))
            elif verbose:
                print('Collected %d / %d trajectories'
                      % (batchsize, min(max_batchsize, target_batchsize)))

            # Adjust confidence before collecting more data for the same
            # update
            if batchsize >= max_batchsize:
                break
            _conf /= 2

        if verbose:
            print('Optimal batch size: %d'
                  % (optimal_batchsize
                     if optimal_batchsize < float('inf') else -1))
            print('Minimum safe batch size: %d'
                  % (min_safe_batchsize
                     if min_safe_batchsize < float('inf') else -1))
            if min_safe_batchsize <= batchsize < optimal_batchsize:
                print('Low sample regime')

        # Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)'
                          % (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        # Update long-term quantities
        tot_samples += batchsize

        # Log
        log_row['SampleVar'] = grad_var
        log_row['UScore'] = torch.norm(grad).item() / math.sqrt(grad_var)
        log_row['Safety'] = safety
        log_row['ErrBound'] = eps
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        # Check whether enough samples were collected to perform an update
        if batchsize < min_safe_batchsize:
            updated = False
            if verbose:
                print('No update, would require more samples than allowed')
            # Log
            log_row['LipConst'] = old_lip_const
            log_row['StepSize'] = 0.
            log_row['Time'] = time.time() - start
            if verbose:
                print(separator)
            logger.write_row(log_row, it)
            if verbose:
                print(separator)
            # Adjust confidence before collecting new data for the same update
            _conf /= 2
            # Skip to the next iteration (current trajectories are discarded)
            it += 1
            continue

        # Reset confidence for the next update
        _conf = conf

        # Estimate the gradient Lipschitz constant with the off-policy power
        # method
        lip_const = power(policy, batch, grad, disc,
                          step=pow_step,
                          decay_rate=pow_decay,
                          tol=pow_tol,
                          max_it=pow_it,
                          estimator=_estimator,
                          baseline=baseline,
                          shallow=shallow,
                          clip=pow_clip,
                          verbose=verbose)

        # Update the "global" Lipschitz constant
        if it > 0:
            lip_const = ((1 - forget) * max(lip_const, old_lip_const)
                         + forget * lip_const)
        old_lip_const = lip_const
        log_row['LipConst'] = lip_const

        # Select step size
        stepsize = (1. / lip_const
                    * (1 - eps / (torch.norm(grad)
                                  * math.sqrt(batchsize - dfn)).item()))
        if fast:
            stepsize *= 2
        log_row['StepSize'] = stepsize

        # Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)
        updated = True
        updates += 1

        # Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        # Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1

    # Save final parameters
    if save_params:
        logger.save_params(params, it)

    # Cleanup
    logger.close()
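
# Illustrative usage sketch (added for exposition). The environment id, the
# policy class and the hyperparameter values below are placeholders: only the
# semisafepg signature documented above is taken from this file, so the
# concrete environment and policy must be adapted to the surrounding codebase.
if __name__ == '__main__':
    import gym

    env = gym.make('MountainCarContinuous-v0')  # placeholder environment

    # Placeholder: any shallow Gaussian policy with a learnable scale from
    # this codebase can be used here; the class name is assumed, not given.
    policy = ShallowGaussianPolicy(sum(env.observation_space.shape),
                                   sum(env.action_space.shape),
                                   learn_std=True)

    semisafepg(env, policy,
               horizon=100,
               conf=0.05,
               min_batchsize=32,
               max_batchsize=5000,
               max_samples=1e6,
               disc=0.99,
               estimator='gpomdp',
               baseline='peters',
               logger=Logger(name='SSPG_demo'),
               seed=42,
               save_params=100,
               verbose=1)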