def reinforce2(alpha, logsig, env, policy, horizon, *,
               batchsize=100,
               iterations=1000,
               disc=0.99,
               stepper=ConstantStepper(1e-2),
               action_filter=None,
               estimator='gpomdp',
               baseline='avg',
               logger=Logger(name='gpomdp'),
               shallow=False,
               seed=None,
               test_batchsize=False,
               info_key='danger',
               save_params=100,
               log_params=False,
               log_grad=False,
               parallel=False,
               render=False,
               verbose=1):
    """
    REINFORCE/G(PO)MDP algorithm (minigolf experiment variant).
    Returns the collected per-iteration statistics and the cumulative
    discounted performance.

    alpha: label used only to build the output csv/image paths
    logsig: label used only to build the output csv/image paths
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    batchsize: number of trajectories used to estimate the policy gradient
    iterations: number of policy updates
    disc: discount factor
    stepper: step size criterion. A constant step size is used by default
    action_filter: function to apply to the agent's action before feeding it
        to the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward, default), 'peters' (variance-minimizing) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available
        for shallow policies)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If 0 or False,
        no test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy parameters
        to disk. Final parameters are always saved for x > 0. If False, they
        are never saved
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False,
        sequential simulation is performed
    render: how often (every x iterations) to render the agent's behavior on a
        sample trajectory. If False, no rendering happens
    verbose: level of verbosity (0: only logs; 1: normal; 2: maximum)
    """
    # Defaults
    if action_filter is None:
        action_filter = clip(env)

    # Seed agent
    if seed is not None:
        seed_all_agent(seed)

    # Prepare logger
    algo_info = {
        'Algorithm': 'REINFORCE',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'BatchSize': batchsize,
        'Disc': disc,
        'StepSizeCriterion': str(stepper),
        'Seed': seed,
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
                'Exploration', 'Info', 'IterationFails', 'CumulativeFails']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    # Init image & csv
    filename = "../csv/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/data{}.csv".format(
        alpha, logsig, seed)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    data_file = open(filename, mode='w')
    file_writer = csv.writer(data_file, delimiter=',', quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
    visualizer = MGVisualizer(
        "MG visualizer",
        "/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/test{}.png".format(
            alpha, logsig, seed))
    visualizer.clean_panels()

    # PLOTTER INFO
    stats = {'w1': [], 'w2': [], 'w3': [], 'w4': [], 'j': [], 'fail': []}

    # Learning loop
    it = 0
    cumulative_fail = 0
    cumulative_j = 0
    while it < iterations:
        # Begin iteration
        start = time.time()
        if verbose:
            print('\nIteration ', it)
        params = policy.get_flat()
        if verbose > 1:
            print('Parameters:', params)

        # Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
            log_row['UTestPerf'] = performance(test_batch, 1)

        # Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True,
                           key=info_key)

        # Collect trajectories
        batch = generate_batch(env, policy, horizon, batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel,
                               key=info_key)

        # Count fails (a reward of -100 marks a failure)
        rewards = [b[2] for b in batch]
        failures = [np.count_nonzero(r == -100) for r in rewards]
        cumulative_fail += sum(failures)

        perf = performance(batch, disc)
        cumulative_j += perf
        log_row['Perf'] = perf
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['Exploration'] = policy.exploration().item()
        log_row['IterationFails'] = sum(failures)
        log_row['CumulativeFails'] = cumulative_fail

        # Estimate policy gradient
        if estimator == 'gpomdp':
            grad = gpomdp_estimator(batch, disc, policy,
                                    baselinekind=baseline,
                                    shallow=shallow)
        elif estimator == 'reinforce':
            grad = reinforce_estimator(batch, disc, policy,
                                       baselinekind=baseline,
                                       shallow=shallow)
        else:
            raise ValueError('Invalid policy gradient estimator')
        if verbose > 1:
            print('Gradients: ', grad)
        log_row['GradNorm'] = torch.norm(grad).item()

        # Select meta-parameters
        stepsize = stepper.next(grad)
        log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item()

        # Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)

        # Log
        log_row['Time'] = time.time() - start
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
        logger.write_row(log_row, it)

        # Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        if verbose > 1:
            print(new_params)
        params = new_params.numpy()[1:]  # updated w (first parameter dropped)

        # Update csv & image
        visualizer.show_values(params, perf, cumulative_fail)
        file_writer.writerow([params[0], params[1], params[2], params[3],
                              cumulative_fail, perf])

        # PLOTTER INFO
        # if it % 10 == 0:
        stats['w1'].append(params[0])
        stats['w2'].append(params[1])
        stats['w3'].append(params[2])
        stats['w4'].append(params[3])
        stats['j'].append(perf)
        stats['fail'].append(cumulative_fail)

        # Next iteration
        it += 1

    # Save final parameters
    if save_params:
        logger.save_params(params, it)
    visualizer.save_image()

    # Cleanup
    data_file.close()
    logger.close()
    return stats, cumulative_j
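
# Example (hypothetical usage sketch): how reinforce2 might be called for the minigolf
# experiment whose csv/image paths it hard-codes. The environment id and policy
# constructor below (`MiniGolf-v0`, `ShallowGaussianPolicy` and its arguments) are
# assumptions for illustration only, not part of this module; adapt them to the actual
# experiment scripts.
#
#   import gym
#   import potion.envs  # assumed to register the custom environments
#
#   env = gym.make('MiniGolf-v0')
#   policy = ShallowGaussianPolicy(1, 1, logstd_init=-1.0, learn_std=False)
#   stats, cum_j = reinforce2(alpha=1e-2, logsig=-1.0,
#                             env=env, policy=policy, horizon=20,
#                             batchsize=100, iterations=500,
#                             stepper=ConstantStepper(1e-2), seed=42)
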
def reinforce(env, policy, horizon, *,
              batchsize=100,
              iterations=1000,
              disc=0.99,
              stepper=ConstantStepper(1e-2),
              action_filter=None,
              estimator='gpomdp',
              baseline='avg',
              logger=Logger(name='gpomdp'),
              shallow=False,
              seed=None,
              test_batchsize=False,
              info_key='danger',
              save_params=100,
              log_params=False,
              log_grad=False,
              parallel=False,
              render=False,
              verbose=1):
    """
    REINFORCE/G(PO)MDP algorithm

    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    batchsize: number of trajectories used to estimate the policy gradient
    iterations: number of policy updates
    disc: discount factor
    stepper: step size criterion. A constant step size is used by default
    action_filter: function to apply to the agent's action before feeding it
        to the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward, default), 'peters' (variance-minimizing) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available
        for shallow policies)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If 0 or False,
        no test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy parameters
        to disk. Final parameters are always saved for x > 0. If False, they
        are never saved
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False,
        sequential simulation is performed
    render: how often (every x iterations) to render the agent's behavior on a
        sample trajectory. If False, no rendering happens
    verbose: level of verbosity (0: only logs; 1: normal; 2: maximum)
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'REINFORCE',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'BatchSize': batchsize,
        'Disc': disc,
        'StepSizeCriterion': str(stepper),
        'Seed': seed,
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
                'Exploration', 'Info']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Learning loop
    it = 0
    while it < iterations:
        #Begin iteration
        start = time.time()
        if verbose:
            print('\nIteration ', it)
        params = policy.get_flat()
        if verbose > 1:
            print('Parameters:', params)

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
            log_row['UTestPerf'] = performance(test_batch, 1)

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True,
                           key=info_key)

        #Collect trajectories
        batch = generate_batch(env, policy, horizon, batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel,
                               key=info_key)
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['Exploration'] = policy.exploration().item()

        #Estimate policy gradient
        if estimator == 'gpomdp':
            grad = gpomdp_estimator(batch, disc, policy,
                                    baselinekind=baseline,
                                    shallow=shallow)
        elif estimator == 'reinforce':
            grad = reinforce_estimator(batch, disc, policy,
                                       baselinekind=baseline,
                                       shallow=shallow)
        else:
            raise ValueError('Invalid policy gradient estimator')
        if verbose > 1:
            print('Gradients: ', grad)
        log_row['GradNorm'] = torch.norm(grad).item()

        #Select meta-parameters
        stepsize = stepper.next(grad)
        log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item()

        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)

        #Log
        log_row['Time'] = time.time() - start
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
        logger.write_row(log_row, it)

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
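
# Illustrative sketch (not part of the library): the learning loops above only require a
# stepper to expose ``next(grad)`` and a readable ``str()``, as used via ``stepper.next(grad)``
# and ``str(stepper)``. The class below shows one way such an adaptive criterion could look,
# using an RMSProp-like rule; the name and the rule are assumptions for illustration, and it
# relies on the module-level ``torch`` import already used above.
class RMSPropStepperSketch:
    """Hypothetical adaptive step-size criterion with the ``next(grad)`` interface."""

    def __init__(self, alpha=1e-2, decay=0.9, eps=1e-8):
        self.alpha = alpha
        self.decay = decay
        self.eps = eps
        self.sq_avg = None

    def next(self, grad):
        # Per-parameter step sizes scaled by a running RMS of the gradient
        if self.sq_avg is None:
            self.sq_avg = torch.zeros_like(grad)
        self.sq_avg = self.decay * self.sq_avg + (1 - self.decay) * grad ** 2
        return self.alpha / (torch.sqrt(self.sq_avg) + self.eps)

    def __str__(self):
        return 'RMSPropStepperSketch(alpha=%g, decay=%g)' % (self.alpha, self.decay)

# e.g. reinforce(env, policy, horizon, stepper=RMSPropStepperSketch(1e-2))
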
def adastep(env, policy, horizon, pen_coeff, var_bound, *,
            conf=0.2,
            batchsize=5000,
            iterations=float('inf'),
            max_samples=1e6,
            disc=0.9,
            action_filter=None,
            estimator='gpomdp',
            baseline='peters',
            logger=Logger(name='AdaStep'),
            shallow=True,
            meta_conf=0.05,
            seed=None,
            test_batchsize=False,
            info_key='danger',
            save_params=100,
            log_params=True,
            log_grad=False,
            parallel=False,
            render=False,
            verbose=1):
    """
    Safe PG algorithm from "Adaptive Step Size for Policy Gradient Methods",
    Pirotta et al., 2013. Only for Gaussian policies.

    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    pen_coeff: penalty coefficient for the policy update
    var_bound: upper bound on the variance of the PG estimator
    conf: probability of failure
    batchsize: number of trajectories used to estimate the policy gradient
    iterations: maximum number of learning iterations
    max_samples: maximum number of total trajectories
    disc: discount factor
    action_filter: function to apply to the agent's action before feeding it
        to the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available
        for shallow policies)
    meta_conf: confidence level of the safe-update test (for evaluation only)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If 0 or False,
        no test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy parameters
        to disk. Final parameters are always saved for x > 0. If False, they
        are never saved
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False,
        sequential simulation is performed
    render: how often (every x iterations) to render the agent's behavior on a
        sample trajectory. If False, no rendering happens
    verbose: level of verbosity on standard output
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'AdaStep',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'ConfidenceParam': conf,
        'Seed': seed,
        'BatchSize': batchsize,
        'PenalizationCoefficient': pen_coeff,
        'VarianceBound': var_bound
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'BatchSize', 'Info', 'TotSamples', 'Safety'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Initializations
    it = 0
    tot_samples = 0
    safety = 1.
    _estimator = (reinforce_estimator if estimator == 'reinforce'
                  else gpomdp_estimator)
    updated = False
    updates = 0
    unsafe_updates = 0
    eps = math.sqrt(var_bound / conf)

    #Learning loop
    while it < iterations and tot_samples < max_samples:
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True)

        #Collect trajectories according to the fixed batch size
        batch = generate_batch(env, policy, horizon,
                               episodes=batchsize,
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)

        #Estimate policy gradient
        grad_samples = _estimator(batch, disc, policy,
                                  baselinekind=baseline,
                                  shallow=shallow,
                                  result='samples')
        grad = torch.mean(grad_samples, 0)
        lower = torch.clamp(torch.abs(grad) - eps / math.sqrt(batchsize),
                            0, float('inf'))
        upper = torch.abs(grad) + eps / math.sqrt(batchsize)

        #Update long-term quantities
        tot_samples += batchsize

        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)'
                          % (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Log
        log_row['Safety'] = safety
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        #Check if the number of samples is sufficient to perform an update
        if torch.norm(lower) == 0:
            updated = False
            if verbose:
                print('No update, would require more samples')

        #Select step size (zero whenever the lower bound on the gradient is zero)
        stepsize = (torch.norm(lower)**2 /
                    (2 * pen_coeff * torch.sum(upper)**2)).item()
        log_row['StepSize'] = stepsize

        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)
        updated = True
        updates += 1

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
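
# Illustrative sketch (not called by adastep): the AdaStep step-size rule above, isolated
# as a standalone function so the formula is easier to inspect. ``grad`` is a flat torch
# tensor of gradient estimates; the other arguments mirror adastep's parameters. It relies
# only on the module-level ``torch`` and ``math`` imports used above.
def _adastep_stepsize_sketch(grad, var_bound, conf, batchsize, pen_coeff):
    eps = math.sqrt(var_bound / conf)              # Chebyshev-style error bound
    dev = eps / math.sqrt(batchsize)               # per-coordinate estimation uncertainty
    lower = torch.clamp(torch.abs(grad) - dev, 0, float('inf'))  # pessimistic |grad|
    upper = torch.abs(grad) + dev                  # optimistic |grad|
    return (torch.norm(lower) ** 2
            / (2 * pen_coeff * torch.sum(upper) ** 2)).item()

# e.g. _adastep_stepsize_sketch(torch.tensor([0.4, -0.2]), var_bound=1.0,
#                               conf=0.2, batchsize=5000, pen_coeff=10.0)
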
def adabatch(env, policy, horizon, pen_coeff, *,
             bound='chebyshev',
             var_bound=None,
             grad_range=None,
             conf=0.2,
             min_batchsize=32,
             max_batchsize=10000,
             iterations=float('inf'),
             max_samples=1e6,
             disc=0.9,
             action_filter=None,
             estimator='gpomdp',
             baseline='peters',
             logger=Logger(name='AdaBatch'),
             shallow=True,
             meta_conf=0.05,
             seed=None,
             test_batchsize=False,
             info_key='danger',
             save_params=100,
             log_params=True,
             log_grad=False,
             parallel=False,
             render=False,
             verbose=1):
    """
    Safe PG algorithm from "Adaptive Batch Size for Safe Policy Gradients",
    Papini et al., 2017. Only for Gaussian policies.

    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    pen_coeff: penalty coefficient for the policy update
    bound: statistical inequality used to determine the optimal batch size
        (chebyshev/student/hoeffding/bernstein)
    var_bound: upper bound on the variance of the PG estimator. Must not be
        None if Chebyshev's bound is employed
    grad_range: theoretical range of the gradient estimate. If None, it is
        estimated from data (in a biased way)
    conf: probability of failure
    min_batchsize: minimum number of trajectories used to estimate the policy
        gradient
    max_batchsize: maximum number of trajectories used to estimate the policy
        gradient
    iterations: maximum number of policy updates
    max_samples: maximum number of total trajectories
    disc: discount factor
    action_filter: function to apply to the agent's action before feeding it
        to the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available
        for shallow policies)
    meta_conf: confidence level of the safe-update test (for evaluation only)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If 0 or False,
        no test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy parameters
        to disk. Final parameters are always saved for x > 0. If False, they
        are never saved
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False,
        sequential simulation is performed
    render: how often (every x iterations) to render the agent's behavior on a
        sample trajectory. If False, no rendering happens
    verbose: level of verbosity
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)
    if bound == 'chebyshev' and var_bound is None:
        raise NotImplementedError("Chebyshev's bound requires var_bound")
    empirical_range = (grad_range is None)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'AdaBatch',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'ConfidenceParam': conf,
        'Seed': seed,
        'MinBatchSize': min_batchsize,
        'MaxBatchSize': max_batchsize,
        'PenalizationCoefficient': pen_coeff,
        'VarianceBound': var_bound,
        'Bound': bound
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'BatchSize', 'Info', 'TotSamples', 'GradVar', 'GradRange', 'Safety',
        'Err', 'GradInfNorm'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Initializations
    it = 0
    tot_samples = 0
    safety = 1.
    optimal_batchsize = min_batchsize
    _estimator = (reinforce_estimator if estimator == 'reinforce'
                  else gpomdp_estimator)
    updated = False
    updates = 0
    unsafe_updates = 0
    params = policy.get_flat()
    max_grad = torch.zeros_like(params) - float('inf')
    min_grad = torch.zeros_like(params) + float('inf')

    #Learning loop
    while it < iterations and tot_samples < max_samples:
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True)

        #Collect trajectories according to the previous optimal batch size
        batch = generate_batch(env, policy, horizon,
                               episodes=max(min_batchsize,
                                            min(max_batchsize,
                                                optimal_batchsize)),
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)
        batchsize = len(batch)

        #Estimate policy gradient
        grad_samples = _estimator(batch, disc, policy,
                                  baselinekind=baseline,
                                  shallow=shallow,
                                  result='samples')
        grad = torch.mean(grad_samples, 0)
        grad_infnorm = torch.max(torch.abs(grad))
        coordinate = torch.min(torch.argmax(torch.abs(grad))).item()

        #Compute statistics for the estimation error
        if bound in ['bernstein', 'student']:
            grad_var = torch.var(grad_samples, 0, unbiased=True)
            grad_var = torch.max(grad_var).item()
            log_row['GradVar'] = grad_var
        else:
            log_row['GradVar'] = var_bound
        if bound in ['bernstein', 'hoeffding'] and empirical_range:
            max_grad = torch.max(grad, max_grad)
            min_grad = torch.min(min_grad, grad)
            grad_range = torch.max(max_grad - min_grad).item()
            if grad_range <= 0:
                grad_range = torch.max(2 * abs(max_grad)).item()
        log_row['GradRange'] = grad_range

        #Compute estimation error
        if bound == 'chebyshev':
            eps = math.sqrt(var_bound / conf)
        elif bound == 'student':
            quant = sts.t.ppf(1 - conf, batchsize)
            eps = quant * math.sqrt(grad_var)
        elif bound == 'hoeffding':
            eps = grad_range * math.sqrt(math.log(2. / conf) / 2)
        elif bound == 'bernstein':
            eps = math.sqrt(2 * grad_var * math.log(3. / conf))
            eps2 = 3 * grad_range * math.log(3. / conf)

        #Compute optimal batch size
        if bound in ['chebyshev', 'student', 'hoeffding']:
            optimal_batchsize = math.ceil(
                ((13 + 3 * math.sqrt(17)) * eps**2
                 / (2 * grad_infnorm**2)).item())
            min_safe_batchsize = math.ceil((eps**2 / grad_infnorm**2).item())
        else:
            min_safe_batchsize = math.ceil(
                ((eps + math.sqrt(eps**2 + 4 * eps2 * grad_infnorm))
                 / (2 * grad_infnorm))**2)
            optimal_batchsize = min_safe_batchsize
            _stepsize = ((grad_infnorm - eps / math.sqrt(optimal_batchsize)
                          - eps2 / optimal_batchsize)**2
                         / (2 * pen_coeff
                            * (grad_infnorm
                               + eps / math.sqrt(optimal_batchsize)
                               + eps2 / optimal_batchsize)**2)).item()
            ups = (grad_infnorm**2 * _stepsize * (1 - pen_coeff * _stepsize)
                   / optimal_batchsize)
            old_ups = -float('inf')
            #Increase the batch size until the per-sample guaranteed improvement stops growing
            while ups > old_ups:
                optimal_batchsize += 1
                old_ups = ups
                _stepsize = ((grad_infnorm
                              - eps / math.sqrt(optimal_batchsize)
                              - eps2 / optimal_batchsize)**2
                             / (2 * pen_coeff
                                * (grad_infnorm
                                   + eps / math.sqrt(optimal_batchsize)
                                   + eps2 / optimal_batchsize)**2)).item()
                ups = (grad_infnorm**2 * _stepsize
                       * (1 - pen_coeff * _stepsize)
                       / optimal_batchsize)
            optimal_batchsize -= 1
        if verbose:
            print('Optimal batch size: %d' % optimal_batchsize)

        #Update long-term quantities
        tot_samples += batchsize

        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)'
                          % (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Log
        log_row['Err'] = eps
        log_row['Safety'] = safety
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['GradInfNorm'] = grad_infnorm.item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        #Check if the number of samples is sufficient to perform an update
        if grad_infnorm < eps / math.sqrt(batchsize):
            updated = False
            if verbose:
                print('No update, need more samples')

            #Log
            log_row['StepSize'] = 0.
            log_row['Time'] = time.time() - start
            if verbose:
                print(separator)
            logger.write_row(log_row, it)
            if verbose:
                print(separator)

            #Skip to next iteration (current trajectories are discarded)
            it += 1
            continue

        #Select step size
        if bound == 'bernstein':
            stepsize = ((grad_infnorm - eps / math.sqrt(batchsize)
                         - eps2 / batchsize)**2
                        / (2 * pen_coeff
                           * (grad_infnorm + eps / math.sqrt(batchsize)
                              + eps2 / batchsize)**2)).item()
        else:
            stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff)
        log_row['StepSize'] = stepsize

        #Update policy parameters (only the coordinate with the largest
        #estimated gradient component is updated)
        new_params = params
        new_params[coordinate] = (params[coordinate]
                                  + stepsize * grad[coordinate])
        policy.set_from_flat(new_params)
        updated = True
        updates += 1

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
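
# Illustrative sketch (not called by adabatch): the closed-form quantities used above for
# the Chebyshev bound, isolated for inspection. ``grad`` is a flat torch tensor; the other
# arguments mirror adabatch's parameters. It relies only on the module-level ``torch`` and
# ``math`` imports used above.
def _adabatch_chebyshev_sketch(grad, var_bound, conf, pen_coeff):
    eps = math.sqrt(var_bound / conf)               # estimation-error bound
    grad_infnorm = torch.max(torch.abs(grad))       # largest gradient coordinate (the one updated)
    optimal_batchsize = math.ceil(
        ((13 + 3 * math.sqrt(17)) * eps ** 2 / (2 * grad_infnorm ** 2)).item())
    min_safe_batchsize = math.ceil((eps ** 2 / grad_infnorm ** 2).item())
    # Constant step size used with the Chebyshev/student/hoeffding bounds,
    # roughly 0.158 / pen_coeff
    stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff)
    return optimal_batchsize, min_safe_batchsize, stepsize

# e.g. _adabatch_chebyshev_sketch(torch.tensor([0.4, -0.2]), var_bound=1.0,
#                                 conf=0.2, pen_coeff=10.0)
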
def semisafepg(env, policy, horizon, *,
               conf=0.05,
               min_batchsize=32,
               max_batchsize=5000,
               iterations=float('inf'),
               max_samples=1e6,
               disc=0.9,
               forget=0.1,
               action_filter=None,
               estimator='gpomdp',
               baseline='peters',
               logger=Logger(name='SSPG'),
               shallow=True,
               pow_step=0.01,
               pow_decay=0.99,
               pow_it=100,
               pow_tol=0.05,
               pow_clip=0.1,
               fast=False,
               meta_conf=0.05,
               seed=None,
               test_batchsize=False,
               info_key='danger',
               save_params=100,
               log_params=True,
               log_grad=False,
               parallel=False,
               render=False,
               verbose=1):
    """
    Semi-safe PG algorithm from "Smoothing Policies and Safe Policy
    Gradients", Papini et al., 2019.

    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    conf: probability of unsafety (per update)
    min_batchsize: minimum number of trajectories used to estimate the policy
        gradient
    max_batchsize: maximum number of trajectories used to estimate the policy
        gradient
    iterations: maximum number of learning iterations
    max_samples: maximum number of total trajectories
    disc: discount factor
    forget: decay of the (estimated) global gradient Lipschitz constant
    action_filter: function to apply to the agent's action before feeding it
        to the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard)
    shallow: whether to employ pre-computed score functions (only available
        for shallow policies)
    pow_step: step size of the power method
    pow_decay: initial decay parameter of the power method
    pow_it: maximum number of iterations (per epoch) of the power method
    pow_tol: relative-error tolerance of the power method
    pow_clip: importance-weight clipping parameter for the power method
    fast: whether to pursue maximum convergence speed (under safety
        constraints)
    meta_conf: confidence level of the safe-update test (for evaluation only)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If False, no
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy parameters
        to disk. Final parameters are always saved for x > 0. If False, they
        are never saved
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If False, sequential
        simulation is performed
    render: how often (every x iterations) to render the agent's behavior on a
        sample trajectory. If False, no rendering happens
    verbose: level of verbosity
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'SSPG',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'ConfidenceParam': conf,
        'Seed': seed,
        'MinBatchSize': min_batchsize,
        'MaxBatchSize': max_batchsize,
        'ForgetParam': forget,
        'PowerStep': pow_step,
        'PowerDecay': pow_decay,
        'PowerIters': pow_it,
        'PowerTolerance': pow_tol,
        'Fast': fast
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
                'BatchSize', 'LipConst', 'ErrBound', 'SampleVar', 'Info',
                'TotSamples', 'Safety', 'UScore']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Initializations
    it = 0
    updated = False
    updates = 0
    unsafe_updates = 0
    safety = 1.
    tot_samples = 0
    optimal_batchsize = min_batchsize
    min_safe_batchsize = min_batchsize
    _conf = conf
    _estimator = (reinforce_estimator if estimator == 'reinforce'
                  else gpomdp_estimator)
    old_lip_const = 0.
    dfn = policy.get_flat().shape[0]
    min_batchsize = max(min_batchsize, dfn + 1)

    #Learning loop
    while it < iterations and tot_samples < max_samples:
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True)

        #Collect trajectories according to the target batch size
        target_batchsize = min_safe_batchsize if fast else optimal_batchsize
        batch = generate_batch(env, policy, horizon,
                               episodes=max(min_batchsize,
                                            min(max_batchsize,
                                                target_batchsize)),
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)
        batchsize = len(batch)

        #Collect more trajectories to match the minimum safe batch size
        do = True
        while do or batchsize < min_safe_batchsize:
            do = False
            batch += generate_batch(env, policy, horizon,
                                    episodes=(min(max_batchsize,
                                                  min_safe_batchsize)
                                              - batchsize),
                                    action_filter=action_filter,
                                    n_jobs=parallel,
                                    key=info_key)
            batchsize = len(batch)

            #Estimate policy gradient
            grad_samples = _estimator(batch, disc, policy,
                                      baselinekind=baseline,
                                      shallow=shallow,
                                      result='samples')
            grad = torch.mean(grad_samples, 0)

            #Compute estimation error with ellipsoid confidence region
            centered = grad_samples - grad.unsqueeze(0)
            grad_cov = (batchsize / (batchsize - 1)
                        * torch.mean(torch.bmm(centered.unsqueeze(2),
                                               centered.unsqueeze(1)), 0))
            grad_var = torch.sum(torch.diag(grad_cov)).item()  #for humans
            max_eigv = eigsh(grad_cov.numpy(), 1)[0][0]
            quant = sts.f.ppf(1 - _conf, dfn, batchsize - dfn)
            eps = math.sqrt(max_eigv * dfn * quant)

            #Optimal batch size
            optimal_batchsize = torch.ceil(
                4 * eps**2 / (torch.norm(grad)**2) + dfn).item()
            min_safe_batchsize = torch.ceil(
                eps**2 / torch.norm(grad)**2 + dfn).item()
            target_batchsize = (min_safe_batchsize if fast
                                else optimal_batchsize)
            if verbose and optimal_batchsize < max_batchsize:
                print('Collected %d / %d trajectories'
                      % (batchsize, target_batchsize))
            elif verbose:
                print('Collected %d / %d trajectories'
                      % (batchsize, min(max_batchsize, target_batchsize)))

            #Adjust confidence before collecting more data for the same update
            if batchsize >= max_batchsize:
                break
            _conf /= 2

        if verbose:
            print('Optimal batch size: %d'
                  % (optimal_batchsize
                     if optimal_batchsize < float('inf') else -1))
            print('Minimum safe batch size: %d'
                  % (min_safe_batchsize
                     if min_safe_batchsize < float('inf') else -1))
            if (batchsize >= min_safe_batchsize
                    and batchsize < optimal_batchsize):
                print('Low sample regime')

        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)'
                          % (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Update long-term quantities
        tot_samples += batchsize

        #Log
        log_row['SampleVar'] = grad_var
        log_row['UScore'] = torch.norm(grad).item() / math.sqrt(grad_var)
        log_row['Safety'] = safety
        log_row['ErrBound'] = eps
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        #Check if the number of samples is sufficient to perform an update
        if batchsize < min_safe_batchsize:
            updated = False
            if verbose:
                print('No update, would require more samples than allowed')

            #Log
            log_row['LipConst'] = old_lip_const
            log_row['StepSize'] = 0.
            log_row['Time'] = time.time() - start
            if verbose:
                print(separator)
            logger.write_row(log_row, it)
            if verbose:
                print(separator)

            #Adjust confidence before collecting new data for the same update
            _conf /= 2

            #Skip to next iteration (current trajectories are discarded)
            it += 1
            continue

        #Reset confidence for the next update
        _conf = conf

        #Estimate the gradient Lipschitz constant with the off-policy power method
        lip_const = power(policy, batch, grad, disc,
                          step=pow_step,
                          decay_rate=pow_decay,
                          tol=pow_tol,
                          max_it=pow_it,
                          estimator=_estimator,
                          baseline=baseline,
                          shallow=shallow,
                          clip=pow_clip,
                          verbose=verbose)

        #Update "global" Lipschitz constant
        if it > 0:
            lip_const = ((1 - forget) * max(lip_const, old_lip_const)
                         + forget * lip_const)
        old_lip_const = lip_const
        log_row['LipConst'] = lip_const

        #Select step size
        stepsize = (1. / lip_const
                    * (1 - eps / (torch.norm(grad)
                                  * math.sqrt(batchsize - dfn)).item()))
        if fast:
            stepsize *= 2
        log_row['StepSize'] = stepsize

        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)
        updated = True
        updates += 1

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
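
# Illustrative sketch (not called by semisafepg): the ellipsoidal confidence-region error
# bound computed inside the collection loop above, isolated as a function. ``grad_samples``
# is the (batchsize x d) tensor of per-trajectory gradient estimates returned by the
# estimator with result='samples'; it assumes batchsize > d and d > 1, and relies on the
# module-level ``torch``, ``math``, ``sts`` and ``eigsh`` imports used above.
def _sspg_error_bound_sketch(grad_samples, conf):
    batchsize, dfn = grad_samples.shape
    grad = torch.mean(grad_samples, 0)
    centered = grad_samples - grad.unsqueeze(0)
    # Unbiased sample covariance of the gradient estimate
    grad_cov = (batchsize / (batchsize - 1)
                * torch.mean(torch.bmm(centered.unsqueeze(2),
                                       centered.unsqueeze(1)), 0))
    max_eigv = eigsh(grad_cov.numpy(), 1)[0][0]        # largest eigenvalue
    quant = sts.f.ppf(1 - conf, dfn, batchsize - dfn)  # F-distribution quantile
    return math.sqrt(max_eigv * dfn * quant)

# e.g. eps = _sspg_error_bound_sketch(grad_samples, conf=0.05)
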