Example #1
def reinforce2(alpha, logsig, env, policy, horizon, *,
              batchsize=100,
              iterations=1000,
              disc=0.99,
              stepper=ConstantStepper(1e-2),
              action_filter=None,
              estimator='gpomdp',
              baseline='avg',
              logger=Logger(name='gpomdp'),
              shallow=False,
              seed=None,
              test_batchsize=False,
              info_key='danger',
              save_params=100,
              log_params=False,
              log_grad=False,
              parallel=False,
              render=False,
              verbose=1):
    """
    REINFORCE/G(PO)MDP algorithm

    alpha: value used only to name the output csv/image files
    logsig: value used only to name the output csv/image files (LOGSTD in the path)
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    batchsize: number of trajectories used to estimate policy gradient
    iterations: number of policy updates
    disc: discount factor
    stepper: step size criterion. A constant step size is used by default
    action_filter: function to apply to the agent's action before feeding it to
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        has lower variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward, default), 'peters' (variance-minimizing) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If 0 or False, no
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy
        parameters to disk. Final parameters are always saved for
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False,
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity (0: only logs; 1: normal; 2: maximum)
    """

    # Defaults
    if action_filter is None:
        action_filter = clip(env)

    # Seed agent
    if seed is not None:
        seed_all_agent(seed)

    # Prepare logger
    algo_info = {'Algorithm': 'REINFORCE',
                 'Estimator': estimator,
                 'Baseline': baseline,
                 'Env': str(env),
                 'Horizon': horizon,
                 'BatchSize': batchsize,
                 'Disc': disc,
                 'StepSizeCriterion': str(stepper),
                 'Seed': seed,
                 }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf',
                'UPerf',
                'AvgHorizon',
                'StepSize',
                'GradNorm',
                'Time',
                'Exploration',
                'IterationFails',
                'CumulativeFails',
                'Info']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    # init image & csv
    filename = "../csv/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/data{}.csv".format(alpha, logsig, seed)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    data_file = open(filename, mode='w')
    file_writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    visualizer = MGVisualizer("MG visualizer",
                              "/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/test{}.png".format(
                                  alpha, logsig, seed))
    visualizer.clean_panels()

    # PLOTTER INFO
    stats = {}
    stats['w1'] = []
    stats['w2'] = []
    stats['w3'] = []
    stats['w4'] = []
    stats['j'] = []
    stats['fail'] = []
    # ------------

    # Learning loop
    it = 0
    cumulative_fail = 0
    cumulative_j = 0
    while it < iterations:
        # Begin iteration
        start = time.time()
        if verbose:
            print('\nIteration ', it)
        params = policy.get_flat()
        if verbose > 1:
            print('Parameters:', params)

        # Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
            log_row['UTestPerf'] = performance(test_batch, 1)

        # Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True,
                           key=info_key)

        # Collect trajectories
        batch = generate_batch(env, policy, horizon, batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel,
                               key=info_key)

        # ------------------- count fails -------------------
        rewards = [b[2] for b in batch]
        failures = [np.count_nonzero(r == -100) for r in rewards]
        cumulative_fail += sum(failures)
        # ---------------------------------------------------

        perf = performance(batch, disc)
        cumulative_j += perf
        log_row['Perf'] = perf
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['Exploration'] = policy.exploration().item()
        log_row['IterationFails'] = sum(failures)
        log_row['CumulativeFails'] = cumulative_fail

        # Estimate policy gradient
        if estimator == 'gpomdp':
            grad = gpomdp_estimator(batch, disc, policy,
                                    baselinekind=baseline,
                                    shallow=shallow)
        elif estimator == 'reinforce':
            grad = reinforce_estimator(batch, disc, policy,
                                       baselinekind=baseline,
                                       shallow=shallow)
        else:
            raise ValueError('Invalid policy gradient estimator')
        if verbose > 1:
            print('Gradients: ', grad)
        log_row['GradNorm'] = torch.norm(grad).item()

        # Select meta-parameters
        stepsize = stepper.next(grad)
        log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item()

        # Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)

        # Log
        log_row['Time'] = time.time() - start
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
        logger.write_row(log_row, it)

        # Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        if verbose > 1:
            print('Updated parameters:', new_params)
        params = new_params.numpy()[1:]  # updated weights w (first entry dropped)

        # update csv & image
        visualizer.show_values(params, perf, cumulative_fail)
        file_writer.writerow([params[0], params[1], params[2], params[3], cumulative_fail, perf])

        # PLOTTER INFO
        # if it % 10 == 0:
        stats['w1'].append(params[0])
        stats['w2'].append(params[1])
        stats['w3'].append(params[2])
        stats['w4'].append(params[3])
        stats['j'].append(perf)
        stats['fail'].append(cumulative_fail)
        # ------------

        # Next iteration
        it += 1

    # Save final parameters
    if save_params:
        logger.save_params(params, it)

    visualizer.save_image()
    # Cleanup
    logger.close()
    return stats, cumulative_j
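A minimal call sketch for reinforce2, assuming the surrounding library provides a Gym-style environment and a shallow Gaussian policy; my_env and my_policy below are placeholders for those objects, and the keyword arguments are taken directly from the signature above (alpha and logsig only tag the output csv/image paths).

# Hypothetical usage sketch: my_env and my_policy stand in for an environment
# and a shallow Gaussian policy supplied by the surrounding library; they are
# not defined here.
stats, cumulative_return = reinforce2(
    alpha=1e-2,                      # only used to name the output files
    logsig=-1.0,                     # only used to name the output files
    env=my_env,
    policy=my_policy,
    horizon=20,
    batchsize=100,
    iterations=200,
    disc=0.99,
    stepper=ConstantStepper(1e-2),
    estimator='gpomdp',
    baseline='avg',
    seed=42,
    verbose=1)
print('Last average return:', stats['j'][-1])
print('Cumulative failures:', stats['fail'][-1])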
Example #2
def reinforce(env,
              policy,
              horizon,
              *,
              batchsize=100,
              iterations=1000,
              disc=0.99,
              stepper=ConstantStepper(1e-2),
              action_filter=None,
              estimator='gpomdp',
              baseline='avg',
              logger=Logger(name='gpomdp'),
              shallow=False,
              seed=None,
              test_batchsize=False,
              info_key='danger',
              save_params=100,
              log_params=False,
              log_grad=False,
              parallel=False,
              render=False,
              verbose=1):
    """
    REINFORCE/G(PO)MDP algorithm
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    batchsize: number of trajectories used to estimate policy gradient
    iterations: number of policy updates
    disc: discount factor
    stepper: step size criterion. A constant step size is used by default
    action_filter: function to apply to the agent's action before feeding it to
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        has lower variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward, default), 'peters' (variance-minimizing) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If 0 or False, no
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity (0: only logs; 1: normal; 2: maximum)
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'REINFORCE',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'BatchSize': batchsize,
        'Disc': disc,
        'StepSizeCriterion': str(stepper),
        'Seed': seed,
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'Exploration', 'Info'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Learning loop
    it = 0
    while (it < iterations):
        #Begin iteration
        start = time.time()
        if verbose:
            print('\nIteration ', it)
        params = policy.get_flat()
        if verbose > 1:
            print('Parameters:', params)

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
            log_row['UTestPerf'] = performance(test_batch, 1)

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env,
                           policy,
                           horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True,
                           key=info_key)

        #Collect trajectories
        batch = generate_batch(env,
                               policy,
                               horizon,
                               batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel,
                               key=info_key)
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['Exploration'] = policy.exploration().item()

        #Estimate policy gradient
        if estimator == 'gpomdp':
            grad = gpomdp_estimator(batch,
                                    disc,
                                    policy,
                                    baselinekind=baseline,
                                    shallow=shallow)
        elif estimator == 'reinforce':
            grad = reinforce_estimator(batch,
                                       disc,
                                       policy,
                                       baselinekind=baseline,
                                       shallow=shallow)
        else:
            raise ValueError('Invalid policy gradient estimator')
        if verbose > 1:
            print('Gradients: ', grad)
        log_row['GradNorm'] = torch.norm(grad).item()

        #Select meta-parameters
        stepsize = stepper.next(grad)
        log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item()

        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)

        #Log
        log_row['Time'] = time.time() - start
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
        logger.write_row(log_row, it)

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
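A minimal call sketch for reinforce, analogous to the previous example but without the minigolf-specific csv/image bookkeeping; my_env and my_policy are again placeholders for library-provided objects.

# Hypothetical usage sketch: my_env and my_policy are placeholders.
reinforce(my_env, my_policy, horizon=20,
          batchsize=100,
          iterations=500,
          disc=0.99,
          stepper=ConstantStepper(1e-2),
          estimator='gpomdp',
          baseline='avg',
          logger=Logger(name='reinforce_run'),
          seed=0,
          save_params=100,
          verbose=1)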
Example #3
def adastep(env,
            policy,
            horizon,
            pen_coeff,
            var_bound,
            *,
            conf=0.2,
            batchsize=5000,
            iterations=float('inf'),
            max_samples=1e6,
            disc=0.9,
            action_filter=None,
            estimator='gpomdp',
            baseline='peters',
            logger=Logger(name='AdaStep'),
            shallow=True,
            meta_conf=0.05,
            seed=None,
            test_batchsize=False,
            info_key='danger',
            save_params=100,
            log_params=True,
            log_grad=False,
            parallel=False,
            render=False,
            verbose=1):
    """
    Safe PG algorithm from "Adaptive Step Size for Policy Gradient Methods", 
                        Pirotta et al., 2013.
    Only for Gaussian policies.
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    pen_coeff: penalty coefficient for policy update
    var_bound: upper bound on the variance of the PG estimator
    conf: probability of failure
    batchsize: number of trajectories to estimate policy gradient
    iterations: maximum number of learning iterations
    max_samples: maximum number of total trajectories
    disc: discount factor
    action_filter: function to apply to the agent's action before feeding it to
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        has lower variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    meta_conf: confidence level of safe-update test (for evaluation only)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the 
        corresponding deterministic policy at each iteration. If 0 or False, no 
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity on standard output
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'AdaStep',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'ConfidenceParam': conf,
        'Seed': seed,
        'BatchSize': batchsize,
        'PenalizationCoefficient': pen_coeff,
        'VarianceBound': var_bound
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'BatchSize', 'Info', 'TotSamples', 'Safety'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Initializations
    it = 0
    tot_samples = 0
    safety = 1.
    _estimator = (reinforce_estimator
                  if estimator == 'reinforce' else gpomdp_estimator)
    updated = False
    updates = 0
    unsafe_updates = 0
    eps = math.sqrt(var_bound / conf)

    #Learning loop
    while (it < iterations and tot_samples < max_samples):
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env,
                           policy,
                           horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True)

        #Collect trajectories according to fixed batch size
        batch = generate_batch(env,
                               policy,
                               horizon,
                               episodes=batchsize,
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)

        #Estimate policy gradient
        grad_samples = _estimator(batch,
                                  disc,
                                  policy,
                                  baselinekind=baseline,
                                  shallow=shallow,
                                  result='samples')
        grad = torch.mean(grad_samples, 0)

        lower = torch.clamp(
            torch.abs(grad) - eps / math.sqrt(batchsize), 0, float('inf'))
        upper = torch.abs(grad) + eps / math.sqrt(batchsize)

        #Update long-term quantities
        tot_samples += batchsize

        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)' %
                          (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Log
        log_row['Safety'] = safety
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        #Check if number of samples is sufficient to perform update
        if torch.norm(lower) == 0:
            updated = False
            if verbose:
                print('No update, would require more samples')

        #Select step size
        stepsize = (torch.norm(lower)**2 /
                    (2 * pen_coeff * torch.sum(upper)**2)).item()
        log_row['StepSize'] = stepsize

        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)
        updated = True
        updates += 1

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)

        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
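The adaptive step size used inside adastep can be reproduced in isolation. The toy sketch below mirrors the computation from the loop above (Chebyshev radius eps, clipped lower/upper bounds on the gradient magnitude, closed-form step size); the numbers are made up and only illustrate the formula.

# Standalone illustration of the adastep step-size rule with toy numbers.
import math
import torch

grad = torch.tensor([0.8, -0.3])   # toy gradient estimate
var_bound = 4.0                    # assumed variance bound on the estimator
conf = 0.2                         # failure probability
pen_coeff = 1.0                    # penalty coefficient
batchsize = 5000

eps = math.sqrt(var_bound / conf)
lower = torch.clamp(torch.abs(grad) - eps / math.sqrt(batchsize), 0, float('inf'))
upper = torch.abs(grad) + eps / math.sqrt(batchsize)
stepsize = (torch.norm(lower)**2 / (2 * pen_coeff * torch.sum(upper)**2)).item()
print('Guaranteed-improvement step size:', stepsize)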
Example #4
def adabatch(env,
             policy,
             horizon,
             pen_coeff,
             *,
             bound='chebyshev',
             var_bound=None,
             grad_range=None,
             conf=0.2,
             min_batchsize=32,
             max_batchsize=10000,
             iterations=float('inf'),
             max_samples=1e6,
             disc=0.9,
             action_filter=None,
             estimator='gpomdp',
             baseline='peters',
             logger=Logger(name='AdaBatch'),
             shallow=True,
             meta_conf=0.05,
             seed=None,
             test_batchsize=False,
             info_key='danger',
             save_params=100,
             log_params=True,
             log_grad=False,
             parallel=False,
             render=False,
             verbose=1):
    """
    Safe PG algorithm from "Adaptive Batch Size for Safe Policy Gradients",
                        Papini et al., 2017.
    Only for Gaussian policies.
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    pen_coeff: penalty coefficient for policy update
    bound: statistical inequality used to determine optimal batchsize 
        (chebyshev/student/hoeffding/bernstein)
    var_bound: upper bound on the variance of the PG estimator. Must not be 
        None if Chebyshev's bound is employed
    grad_range: theoretical range of gradient estimate. If none, it is 
        estimated from data (in a biased way)
    conf: probability of failure
    min_batchsize: minimum number of trajectories to estimate policy gradient
    max_batchsize: maximum number of trajectories to estimate policy gradient
    iterations: number of policy updates
    max_samples: maximum number of total trajectories
    disc: discount factor
    action_filter: function to apply to the agent's action before feeding it to
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        has lower variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    meta_conf: confidence level of safe-update test (for evaluation only)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the 
        corresponding deterministic policy at each iteration. If 0 or False, no 
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)
    if bound == 'chebyshev' and var_bound is None:
        raise ValueError('var_bound must be provided when using the Chebyshev bound')
    empirical_range = (grad_range is None)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'AdaBatch',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'ConfidenceParam': conf,
        'Seed': seed,
        'MinBatchSize': min_batchsize,
        'MaxBatchSize': max_batchsize,
        'PenalizationCoefficient': pen_coeff,
        'VarianceBound': var_bound,
        'Bound': bound
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'BatchSize', 'Info', 'TotSamples', 'GradVar', 'GradRange',
        'Safety', 'Err', 'GradInfNorm'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Initializations
    it = 0
    tot_samples = 0
    safety = 1.
    optimal_batchsize = min_batchsize
    _estimator = (reinforce_estimator
                  if estimator == 'reinforce' else gpomdp_estimator)
    updated = False
    updates = 0
    unsafe_updates = 0
    params = policy.get_flat()
    max_grad = torch.zeros_like(params) - float('inf')
    min_grad = torch.zeros_like(params) + float('inf')

    #Learning loop
    while (it < iterations and tot_samples < max_samples):
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env,
                           policy,
                           horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True)

        #Collect trajectories according to previous optimal batch size
        batch = generate_batch(env,
                               policy,
                               horizon,
                               episodes=max(
                                   min_batchsize,
                                   min(max_batchsize, optimal_batchsize)),
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)
        batchsize = len(batch)

        #Estimate policy gradient
        grad_samples = _estimator(batch,
                                  disc,
                                  policy,
                                  baselinekind=baseline,
                                  shallow=shallow,
                                  result='samples')
        grad = torch.mean(grad_samples, 0)
        grad_infnorm = torch.max(torch.abs(grad))
        coordinate = torch.min(torch.argmax(torch.abs(grad))).item()

        #Compute statistics for estimation error
        if bound in ['bernstein', 'student']:
            grad_var = torch.var(grad_samples, 0, unbiased=True)
            grad_var = torch.max(grad_var).item()
            log_row['GradVar'] = grad_var
        else:
            log_row['GradVar'] = var_bound
        if bound in ['bernstein', 'hoeffding'] and empirical_range:
            max_grad = torch.max(grad, max_grad)
            min_grad = torch.min(min_grad, grad)
            grad_range = torch.max(max_grad - min_grad).item()
            if grad_range <= 0:
                grad_range = torch.max(2 * abs(max_grad)).item()
        log_row['GradRange'] = grad_range

        #Compute estimation error
        if bound == 'chebyshev':
            eps = math.sqrt(var_bound / conf)
        elif bound == 'student':
            quant = sts.t.ppf(1 - conf, batchsize)
            eps = quant * math.sqrt(grad_var)
        elif bound == 'hoeffding':
            eps = grad_range * math.sqrt(math.log(2. / conf) / 2)
        elif bound == 'bernstein':
            eps = math.sqrt(2 * grad_var * math.log(3. / conf))
            eps2 = 3 * grad_range * math.log(3. / conf)

        #Compute optimal batch size
        if bound in ['chebyshev', 'student', 'hoeffding']:
            optimal_batchsize = math.ceil(((13 + 3 * math.sqrt(17)) * eps**2 /
                                           (2 * grad_infnorm**2)).item())
            min_safe_batchsize = math.ceil((eps**2 / grad_infnorm**2).item())
        else:
            min_safe_batchsize = math.ceil(
                ((eps + math.sqrt(eps**2 + 4 * eps2 * grad_infnorm)) /
                 (2 * grad_infnorm))**2)
            optimal_batchsize = min_safe_batchsize
            _stepsize = ((grad_infnorm - eps / math.sqrt(optimal_batchsize) -
                          eps2 / optimal_batchsize)**2 /
                         (2 * pen_coeff *
                          (grad_infnorm + eps / math.sqrt(optimal_batchsize) +
                           eps2 / optimal_batchsize)**2)).item()
            ups = (grad_infnorm**2 * _stepsize * (1 - pen_coeff * _stepsize) /
                   optimal_batchsize)
            old_ups = -float('inf')
            while ups > old_ups:
                optimal_batchsize += 1
                old_ups = ups
                _stepsize = (
                    (grad_infnorm - eps / math.sqrt(optimal_batchsize) -
                     eps2 / optimal_batchsize)**2 /
                    (2 * pen_coeff *
                     (grad_infnorm + eps / math.sqrt(optimal_batchsize) +
                      eps2 / optimal_batchsize)**2)).item()
                ups = (grad_infnorm**2 * _stepsize *
                       (1 - pen_coeff * _stepsize) / optimal_batchsize)
            optimal_batchsize -= 1

        if verbose:
            print('Optimal batch size: %d' % optimal_batchsize)

        #Update long-term quantities
        tot_samples += batchsize

        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)' %
                          (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Log
        log_row['Err'] = eps
        log_row['Safety'] = safety
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['GradInfNorm'] = grad_infnorm.item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        #Check if number of samples is sufficient to perform update
        if grad_infnorm < eps / math.sqrt(batchsize):
            updated = False
            if verbose:
                print('No update, need more samples')

            #Log
            log_row['StepSize'] = 0.
            log_row['Time'] = time.time() - start
            if verbose:
                print(separator)
            logger.write_row(log_row, it)
            if verbose:
                print(separator)

            #Skip to next iteration (current trajectories are discarded)
            it += 1
            continue

        #Select step size
        if bound == 'bernstein':
            stepsize = ((grad_infnorm - eps / math.sqrt(batchsize) -
                         eps2 / batchsize)**2 /
                        (2 * pen_coeff *
                         (grad_infnorm + eps / math.sqrt(batchsize) +
                          eps2 / batchsize)**2)).item()
        else:
            stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff)
        log_row['StepSize'] = stepsize

        #Update policy parameters
        new_params = params.clone()  # copy to avoid aliasing the pre-update parameters
        new_params[coordinate] = (params[coordinate] +
                                  stepsize * grad[coordinate])
        policy.set_from_flat(new_params)
        updated = True
        updates += 1

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
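Under the Chebyshev bound, adabatch uses the constant step size (13 - 3*sqrt(17)) / (4 * pen_coeff) and derives the batch size from the estimation error and the sup-norm of the gradient. The toy sketch below recomputes these quantities outside the learning loop, only to make the formulas above concrete; the inputs are made up.

# Standalone illustration of the Chebyshev-bound quantities used by adabatch.
import math
import torch

grad = torch.tensor([0.5, -1.2, 0.1])   # toy gradient estimate
var_bound = 9.0                          # assumed variance bound
conf = 0.2                               # failure probability
pen_coeff = 0.5                          # penalty coefficient

eps = math.sqrt(var_bound / conf)
grad_infnorm = torch.max(torch.abs(grad))
optimal_batchsize = math.ceil(((13 + 3 * math.sqrt(17)) * eps**2 /
                               (2 * grad_infnorm**2)).item())
min_safe_batchsize = math.ceil((eps**2 / grad_infnorm**2).item())
stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff)
print(optimal_batchsize, min_safe_batchsize, stepsize)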
Example #5
def semisafepg(env, policy, horizon, *,
               conf=0.05,
               min_batchsize=32,
               max_batchsize=5000,
               iterations=float('inf'),
               max_samples=1e6,
               disc=0.9,
               forget=0.1,
               action_filter=None,
               estimator='gpomdp',
               baseline='peters',
               logger=Logger(name='SSPG'),
               shallow=True,
               pow_step=0.01,
               pow_decay=0.99,
               pow_it=100,
               pow_tol=0.05,
               pow_clip=0.1,
               fast=False,
               meta_conf=0.05,
               seed=None,
               test_batchsize=False,
               info_key='danger',
               save_params=100,
               log_params=True,
               log_grad=False,
               parallel=False,
               render=False,
               verbose=1):
    """
    Semi-safe PG algorithm from "Smoothing Policies and Safe Policy Gradients",
                                    Papini et al., 2019.
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    conf: probability of unsafety (per update)
    min_batchsize: minimum number of trajectories used to estimate policy 
        gradient
    max_batchsize: maximum number of trajectories used to estimate policy 
        gradient
    iterations: maximum number of learning iterations
    max_samples: maximum number of total trajectories 
    disc: discount factor
    forget: decay of the (estimated) global gradient Lipschitz constant
    action_filter: function to apply to the agent's action before feeding it to
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        has lower variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    pow_step: step size of the power method
    pow_decay: initial decay parameter of the power method
    pow_it: maximum number of iterations (per epoch) of the power method
    pow_tol: relative-error tolerance of the power method
    pow_clip: importance-weight clipping parameter for the power method
        (default 0.1)
    fast: whether to pursue maximum convergence speed 
        (under safety constraints)
    meta_conf: confidence level of safe update test (for evaluation)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the 
        corresponding deterministic policy at each iteration. If False, no 
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)
    
    #Seed agent
    if seed is not None:
        seed_all_agent(seed)
    
    #Prepare logger
    algo_info = {'Algorithm': 'SSPG',
                   'Estimator': estimator,
                   'Baseline': baseline,
                   'Env': str(env), 
                   'Horizon': horizon,
                   'Discount': disc,
                   'Confidence': conf,
                   'ConfidenceParam': conf,
                   'Seed': seed,
                   'MinBatchSize': min_batchsize,
                   'MaxBatchSize': max_batchsize,
                   'ForgetParam': forget,
                   'PowerStep': pow_step,
                   'PowerDecay': pow_decay,
                   'PowerIters': pow_it,
                   'PowerTolerance': pow_tol,
                   'Fast': fast
                   }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf',
                'UPerf',
                'AvgHorizon',
                'StepSize',
                'GradNorm',
                'Time',
                'BatchSize',
                'LipConst',
                'ErrBound',
                'SampleVar',
                'Info',
                'TotSamples',
                'Safety',
                'UScore']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())
    
    #Initializations
    it = 0
    updated = False
    updates = 0
    unsafe_updates = 0
    safety = 1.
    tot_samples = 0
    optimal_batchsize = min_batchsize
    min_safe_batchsize = min_batchsize
    _conf = conf
    _estimator = (reinforce_estimator 
                  if estimator=='reinforce' else gpomdp_estimator)
    old_lip_const = 0.
    dfn = policy.get_flat().shape[0]
    min_batchsize = max(min_batchsize, dfn + 1)
    
    #Learning loop
    while(it < iterations and tot_samples < max_samples):
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()
        
        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, 
                                        episodes=test_batchsize, 
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
        
        #Render the agent's behavior
        if render and it % render==0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter, 
                           render=True)
    
        #Collect trajectories according to target batch size
        target_batchsize = min_safe_batchsize if fast else optimal_batchsize
        batch = generate_batch(env, policy, horizon, 
                                episodes=max(min_batchsize, 
                                             min(max_batchsize, 
                                                 target_batchsize)), 
                                action_filter=action_filter,
                                n_jobs=parallel,
                                key=info_key)
        batchsize = len(batch)
        
        #Collect more trajectories to match minimum safe batch size
        do = True
        while do or batchsize < min_safe_batchsize:
            do = False
            batch += generate_batch(env, policy, horizon, 
                        episodes=(min(max_batchsize, min_safe_batchsize) 
                                    - batchsize), 
                        action_filter=action_filter,
                        n_jobs=parallel,
                        key=info_key)
            batchsize = len(batch)
            
            #Estimate policy gradient
            grad_samples = _estimator(batch, disc, policy, 
                                        baselinekind=baseline, 
                                        shallow=shallow,
                                        result='samples')
            grad = torch.mean(grad_samples, 0)
                
            #Compute estimation error with ellipsoid confidence region
            centered = grad_samples - grad.unsqueeze(0)
            grad_cov = (batchsize/(batchsize - 1) * 
                        torch.mean(torch.bmm(centered.unsqueeze(2), 
                                             centered.unsqueeze(1)),0))
            grad_var = torch.sum(torch.diag(grad_cov)).item() #for humans
            max_eigv = eigsh(grad_cov.numpy(), 1)[0][0]
            quant = sts.f.ppf(1 - _conf, dfn, batchsize - dfn)
            eps = math.sqrt(max_eigv * dfn * quant)
            
            #Optimal batch size
            optimal_batchsize = torch.ceil(4 * eps**2 / 
                                   (torch.norm(grad)**2) + dfn).item()
            min_safe_batchsize = torch.ceil(eps**2 / 
                                            torch.norm(grad)**2 + dfn).item()
            target_batchsize = (min_safe_batchsize if fast 
                                else optimal_batchsize)
            if verbose and optimal_batchsize < max_batchsize:
                print('Collected %d / %d trajectories' 
                      % (batchsize, target_batchsize))
            elif verbose:
                print('Collected %d / %d trajectories' 
                      % (batchsize, min(max_batchsize, target_batchsize)))
            
            #Adjust confidence before collecting more data for the same update
            if batchsize >= max_batchsize:
                break
            _conf /= 2
        
        if verbose:
            print('Optimal batch size: %d' 
                  % (optimal_batchsize if optimal_batchsize < float('inf') 
                      else -1))
            print('Minimum safe batch size: %d' 
                  % (min_safe_batchsize if min_safe_batchsize < float('inf') 
                      else -1))
            if (batchsize >= min_safe_batchsize 
                and batchsize < optimal_batchsize):
                print('Low sample regime')
                
        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)' 
                          % (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Update long-term quantities
        tot_samples += batchsize
        
        #Log
        log_row['SampleVar'] = grad_var
        log_row['UScore'] = torch.norm(grad).item() / math.sqrt(grad_var)
        log_row['Safety'] = safety
        log_row['ErrBound'] = eps
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
                
        #Check if number of samples is sufficient to perform update
        if batchsize < min_safe_batchsize:
            updated = False
            if verbose:
                print('No update, would require more samples than allowed')
            #Log
            log_row['LipConst'] = old_lip_const
            log_row['StepSize'] = 0.
            log_row['Time'] = time.time() - start
            if verbose:
                print(separator)
            logger.write_row(log_row, it)
            if verbose:
                print(separator)
            
            #Adjust confidence before collecting new data for the same update
            _conf /= 2
            
            #Skip to next iteration (current trajectories are discarded)
            it += 1
            continue
        
        #Reset confidence for next update
        _conf = conf
        
        #Estimate gradient Lipschitz constant with off-policy Power Method
        lip_const = power(policy, batch, grad, disc, 
              step=pow_step, 
              decay_rate=pow_decay,
              tol=pow_tol, 
              max_it=pow_it, 
              estimator=_estimator, 
              baseline=baseline, 
              shallow=shallow, 
              clip=pow_clip,
              verbose=verbose)
        
        #Update "global" lipschitz constant
        if it > 0:
            lip_const = ((1 - forget) * max(lip_const, old_lip_const)
                         + forget * lip_const)
        old_lip_const = lip_const
        log_row['LipConst'] = lip_const
        
        #Select step size
        stepsize = 1. / lip_const * (1 - eps / (torch.norm(grad) 
                                        * math.sqrt(batchsize - dfn)).item())
        if fast:
            stepsize *= 2
        log_row['StepSize'] = stepsize
                
        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)
        updated = True
        updates += 1
        
        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)
        
        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1
    
    #Save final parameters
    if save_params:
        logger.save_params(params, it)
    
    #Cleanup
    logger.close()
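A minimal call sketch for semisafepg, with my_env and my_policy again standing in for library-provided objects. Note from the code above that fast=True targets the minimum safe batch size and doubles the step size.

# Hypothetical usage sketch: my_env and my_policy are placeholders for
# objects provided by the surrounding library.
semisafepg(my_env, my_policy, horizon=20,
           conf=0.05,
           min_batchsize=32,
           max_batchsize=5000,
           max_samples=1e6,
           disc=0.9,
           baseline='peters',
           logger=Logger(name='sspg_run'),
           fast=False,      # fast=True targets the minimum safe batch size
           seed=0,          # and doubles the step size
           verbose=1)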