Example #1
def parallel_episode_generator(env,
                               policy,
                               horizon=float('inf'),
                               action_filter=None,
                               seed=None,
                               deterministic=False,
                               key=None):
    ds = sum(env.observation_space.shape)
    ds = max(ds, 1)
    da = sum(env.action_space.shape)
    da = max(da, 1)

    env.seed(seed)
    seed_all_agent(seed)
    states = torch.zeros((horizon, ds), dtype=torch.float)
    actions = torch.zeros((horizon, da), dtype=torch.float)
    rewards = torch.zeros(horizon, dtype=torch.float)
    mask = torch.zeros(horizon, dtype=torch.float)
    infos = torch.zeros(horizon, dtype=torch.float)
    s = env.reset()
    done = False
    t = 0
    while not done and t < horizon:
        s = np.array(s, dtype=np.float32)  # np.float is deprecated/removed in recent NumPy
        s = torch.tensor(s, dtype=torch.float).view(-1)
        a = policy.act(s, deterministic)
        a = torch.tensor(a, dtype=torch.float).view(-1)
        if action_filter is not None:
            a = action_filter(a)
        next_s, r, done, info = env.step(a.numpy())

        states[t] = s
        actions[t] = a
        rewards[t] = r
        mask[t] = 1
        if key is not None and key in info:
            infos[t] = info[key]

        s = next_s
        t += 1
    return states, actions, rewards, mask, infos
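
The generator above only relies on a small duck-typed interface: observation_space.shape / action_space.shape, seed, reset, a 4-tuple step, and a policy exposing act(state, deterministic). The toy environment and policy below are invented purely as a smoke test (they are not part of potion); only the seed_all_agent import is taken from Example #9, and the sketch assumes the generator is defined in the same session.

import numpy as np
import torch
from types import SimpleNamespace
from potion.common.misc_utils import seed_all_agent  # used inside the generator

class ToyEnv:
    """Illustrative stand-in exposing the interface the generator expects."""
    observation_space = SimpleNamespace(shape=(2,))
    action_space = SimpleNamespace(shape=(1,))
    def seed(self, seed=None):
        pass
    def reset(self):
        return np.zeros(2)
    def step(self, a):
        next_s = np.random.randn(2)
        reward = -float(np.abs(a).sum())
        done = np.random.rand() < 0.05  # episode ends with 5% probability
        return next_s, reward, done, {}

class ZeroPolicy:
    """Always plays the zero action; just enough to drive the generator."""
    def act(self, s, deterministic=False):
        return torch.zeros(1)

states, actions, rewards, mask, infos = parallel_episode_generator(
    ToyEnv(), ZeroPolicy(), horizon=50, seed=0)
print(rewards[mask.bool()].sum())  # undiscounted return of the sampled episode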
Example #2
def adabatch(env, policy, 
            horizon,
            batchsize = 100, 
            iterations = 1000,
            gamma = 0.99,
            rmax = 1.,
            phimax = 1.,
            safety_requirement = MonotonicImprovement(),
            test_det = True,
            render = False,
            seed = None,
            baseline = 'peters',
            action_filter = None,
            parallel = False,
            n_jobs = 4,
            logger = Logger(name='test_sunday'),
            save_params = 1000,
            log_params = True,
            verbose = True):
    """
        Only for SIMPLE Gaussian policy w/ scalar variance
    """
        
    # Defaults
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)
    
    # Seeding agent
    if seed is not None:
        seed_all_agent(seed)
    
    # Preparing logger
    algo_info = {'Algorithm': 'ADABATCH', 
                 'Environment': str(env), 
                 'BatchSize': batchsize, 
                 'Max horizon': horizon,
                 'Iterations': iterations,
                 'gamma': gamma, 
                 'actionFilter': action_filter,
                 'rmax': rmax,
                 'phimax': phimax}
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 
                'Alpha', 'BatchSize', 'Exploration', 
                'ThetaGradNorm',
                'Penalty', 'Coordinate']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_det:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)

    logger.open(log_row.keys())
    
    # Learning
    avol = torch.tensor(env.action_space.high - env.action_space.low).item()
    it = 0
    while(it < iterations):
        # Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())
    
        # Test
        if test_det:
            omega = policy.get_scale_params()
            policy.set_scale_params(-100.)
            batch = generate_batch(env, policy, horizon, 1, action_filter)
            policy.set_scale_params(omega)
            log_row['DetPerf'] = performance(batch, gamma)
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        omega = policy.get_scale_params()
        sigma = torch.exp(omega).item()
        batch = generate_batch(env, policy, horizon, batchsize, action_filter, parallel=parallel, n_jobs=n_jobs, seed=seed)
        grad = simple_gpomdp_estimator(batch, gamma, policy, baseline)
        theta_grad = grad[1:]
        norminf = torch.max(torch.abs(theta_grad))
        k = torch.argmax(torch.abs(theta_grad))
        penalty = rmax * phimax**2 / (1-gamma)**2 * (avol / (sigma * math.sqrt(2*math.pi)) + gamma / (2*(1-gamma)))
        alpha_star = sigma ** 2/ (2 * penalty)
        Cmax = alpha_star * norminf**2 / 2
        C = safety_requirement.next()
        alpha = alpha_star * (1 + math.sqrt(1 - C / (Cmax + 1e-12) + 1e-12))
        theta = policy.get_loc_params()
        new_theta = theta.clone()  # avoid modifying the policy's parameter tensor in place
        new_theta[k] += alpha * theta_grad[k]
        policy.set_loc_params(new_theta)

        # Log
        log_row['Coordinate'] = k.item()
        log_row['Alpha'] = alpha
        log_row['Penalty'] = penalty
        log_row['ThetaGradNorm'] = torch.norm(theta_grad).item()
        log_row['BatchSize'] = batchsize
        log_row['Exploration'] = policy.exploration()
        log_row['Perf'] = performance(batch, gamma)
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)
        
        # Next iteration
        it += 1
    
    # Final policy
    if save_params:
        logger.save_params(params, it)
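
In adabatch above only the coordinate with the largest estimated gradient component is updated, with a step size derived from the penalty constant. A stripped-down sketch of just that arithmetic, with made-up numbers and none of the environment/policy machinery:

import math
import torch

# Illustrative values only: a gradient estimate for the mean parameters, the
# current policy stddev, and the constants that enter the penalty term.
theta_grad = torch.tensor([0.3, -1.2, 0.05])
sigma = 0.5
rmax, phimax, gamma, avol = 1., 1., 0.99, 2.

penalty = rmax * phimax**2 / (1 - gamma)**2 * (
    avol / (sigma * math.sqrt(2 * math.pi)) + gamma / (2 * (1 - gamma)))

norminf = torch.max(torch.abs(theta_grad))  # largest coordinate magnitude
k = torch.argmax(torch.abs(theta_grad))     # coordinate to update
alpha_star = sigma**2 / (2 * penalty)       # conservative base step size
Cmax = alpha_star * norminf**2 / 2          # best guaranteed improvement
C = 0.                                      # required improvement (e.g. monotonic)
alpha = alpha_star * (1 + math.sqrt(1 - C / (Cmax + 1e-12) + 1e-12))

theta = torch.zeros(3)
theta[k] += alpha * theta_grad[k]           # update only coordinate k
print(k.item(), alpha, theta)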
Example #3
def adastep(env, policy, 
            horizon,
            batchsize = 100, 
            iterations = 1000,
            gamma = 0.99,
            rmax = 1.,
            phimax = 1.,
            greedy = True,
            delta = 1.,
            test_det = True,
            render = False,
            seed = None,
            baseline = 'peters',
            action_filter = None,
            parallel = False,
            n_jobs = 4,
            logger = Logger(name='test_sunday'),
            save_params = 1000,
            log_params = True,
            verbose = True):
    """
        Only for SIMPLE Gaussian policy w/ scalar variance
        Policy must have learn_std = False, as std is META-learned
    """
        
    # Defaults
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)
    
    # Seeding agent
    if seed is not None:
        seed_all_agent(seed)
    
    # Preparing logger
    algo_info = {'Algorithm': 'ADASTEP', 
                 'Environment': str(env), 
                 'BatchSize': batchsize, 
                 'Max horizon': horizon,
                 'Iterations': iterations,
                 'gamma': gamma, 
                 'actionFilter': action_filter,
                 'rmax': rmax,
                 'phimax': phimax,
                 'greedy': greedy}
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 
                'Alpha', 'BatchSize', 'Exploration', 
                'ThetaGradNorm',
                'Penalty']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_det:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)

    logger.open(log_row.keys())
    
    # Learning
    avol = torch.tensor(env.action_space.high - env.action_space.low).item()
    it = 0
    while(it < iterations):
        # Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())
    
        # Test
        if test_det:
            omega = policy.get_scale_params()
            policy.set_scale_params(-100.)
            batch = generate_batch(env, policy, horizon, 1, action_filter)
            policy.set_scale_params(omega)
            log_row['DetPerf'] = performance(batch, gamma)
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        omega = policy.get_scale_params()
        sigma = torch.exp(omega).item()
        batch = generate_batch(env, policy, horizon, batchsize, action_filter, parallel=parallel, n_jobs=n_jobs, seed=seed)        
        if delta < 1:
            grad, grad_var = simple_gpomdp_estimator(batch, gamma, policy, baseline, result='moments')
            theta_grad = grad[1:]
            theta_grad_var = grad_var[1:]
            quant = 2*sts.t.interval(1 - delta, batchsize-1,loc=0.,scale=1.)[1]
            eps = quant * torch.sqrt(theta_grad_var / batchsize + 1e-12)
            norm2 = torch.norm(torch.clamp(torch.abs(theta_grad) - eps, min=0.))
            norm1 = torch.sum(torch.abs(theta_grad) + eps)
        else:
            grad = simple_gpomdp_estimator(batch, gamma, policy, baseline)
            theta_grad = grad[1:]
            norm2 = torch.norm(theta_grad)
            norm1 = torch.sum(torch.abs(theta_grad))
        penalty = rmax * phimax**2 / (1-gamma)**2 * (avol / (sigma * math.sqrt(2*math.pi)) + gamma / (2*(1-gamma)))
        alpha_star = sigma ** 2 * norm2 ** 2 / (2 * penalty * norm1 ** 2 + 1e-12)
        Cmax = alpha_star * norm2**2 / 2
            
        if greedy:
            C = Cmax
        else:
            C = 0
        alpha = alpha_star * (1 + math.sqrt(1 - C / (Cmax + 1e-12) + 1e-12))
        theta = policy.get_loc_params()
        new_theta = theta + alpha * theta_grad
        policy.set_loc_params(new_theta)

        # Log
        log_row['Penalty'] = penalty
        log_row['ThetaGradNorm'] = torch.norm(theta_grad).item()
        log_row['BatchSize'] = batchsize
        log_row['Exploration'] = policy.exploration()
        log_row['Alpha'] = alpha.item()
        log_row['Perf'] = performance(batch, gamma)
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)
        
        # Next iteration
        it += 1
    
    # Final policy
    if save_params:
        logger.save_params(params, it)
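
In adastep the whole mean-parameter vector is updated; the step size trades off a (possibly confidence-corrected) 2-norm against a 1-norm of the gradient estimate, and greedy=True sets the required improvement C to Cmax, collapsing the step to alpha_star. A self-contained sketch with placeholder statistics (all values are made up):

import math
import torch
import scipy.stats as sts

# Placeholder gradient statistics; in the algorithm these come from
# simple_gpomdp_estimator with result='moments'.
theta_grad = torch.tensor([0.4, -0.9])
theta_grad_var = torch.tensor([0.02, 0.05])
batchsize, delta = 100, 0.2
sigma, rmax, phimax, gamma, avol = 0.5, 1., 1., 0.99, 2.

# Confidence correction: shrink each coordinate for the lower bound (norm2),
# inflate it for the upper bound (norm1).
quant = 2 * sts.t.interval(1 - delta, batchsize - 1, loc=0., scale=1.)[1]
eps = quant * torch.sqrt(theta_grad_var / batchsize + 1e-12)
norm2 = torch.norm(torch.clamp(torch.abs(theta_grad) - eps, min=0.))
norm1 = torch.sum(torch.abs(theta_grad) + eps)

penalty = rmax * phimax**2 / (1 - gamma)**2 * (
    avol / (sigma * math.sqrt(2 * math.pi)) + gamma / (2 * (1 - gamma)))
alpha_star = sigma**2 * norm2**2 / (2 * penalty * norm1**2 + 1e-12)
Cmax = alpha_star * norm2**2 / 2
C = Cmax                                    # greedy=True
alpha = alpha_star * (1 + math.sqrt(1 - C / (Cmax + 1e-12) + 1e-12))
print(alpha.item())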
Example #4
def sepg(env, policy, 
            horizon,
            batchsize = 100, 
            iterations = 1000,
            gamma = 0.99,
            rmax = 1.,
            phimax = 1.,
            safety_requirement = 'mi',
            delta = 1.,
            confidence_schedule = None,
            clip_at = 100,
            test_batchsize = False,
            render = False,
            seed = None,
            baseline = 'peters',
            shallow = True,
            action_filter = None,
            parallel = False,
            logger = Logger(name='SEPG'),
            save_params = 1000,
            log_params = True,
            verbose = True):
    """
        Only for SIMPLE Gaussian policy w/ scalar variance
        Policy must have learn_std = False, as std is META-learned
    """
        
    #Defaults
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)
    
    #Seed agent
    if seed is not None:
        seed_all_agent(seed)
    
    #Prepare logger
    algo_info = {'Algorithm': 'SEPG', 
                 'Environment': str(env), 
                 'BatchSize': batchsize, 
                 'Max horizon': horizon,
                 'Iterations': iterations,
                 'gamma': gamma, 
                 'actionFilter': action_filter,
                 'rmax': rmax,
                 'phimax': phimax}
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 'UPerf', 'AvgHorizon', 
                'Alpha', 'BatchSize', 'Exploration', 'Eta', 
                'ThetaGradNorm', 'OmegaGrad', 'OmegaMetagrad',
                'Penalty', 'MetaPenalty',
                'IterationKind',
                'Eps', 'Up', 'Down', 'C', 'Cmax', 'Delta'] #0: theta, 1: omega
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)

    logger.open(log_row.keys())
    
    #Safety requirements
    if safety_requirement == 'mi':
        thresholder = MonotonicImprovement()
    elif safety_requirement == 'budget':
        batch = generate_batch(env, policy, horizon, batchsize, action_filter)
        thresholder = Budget(performance(batch, gamma))
    else:
        thresholder = FixedThreshold(float(safety_requirement))
    
    #Learning loop
    avol = torch.tensor(env.action_space.high - env.action_space.low).item() #action-space volume, as in adastep/adabatch
    omega_grad = float('nan')
    omega_metagrad = float('nan')
    metapenalty = float('nan')
    penalty = float('nan')
    eta = float('nan')
    alpha = torch.tensor(0.) #no mean update performed yet
    theta_grad = torch.zeros_like(policy.get_loc_params()) #placeholder for first-iteration logging
    it = 0
    while(it < iterations):
        #Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())
    
        #Test mean parameters on deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, test_batchsize, 
                                        action_filter=action_filter,
                                        seed=seed,
                                        njobs=parallel,
                                        deterministic=True)
            log_row['DetPerf'] = performance(test_batch, gamma)
        #Render behavior
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        #
        if it % 2 == 0:
            #Std update
            omega = policy.get_scale_params()
            sigma = torch.exp(omega).item()
            batch = generate_batch(env, policy, horizon, batchsize, 
                                   action_filter=action_filter, 
                                   njobs=parallel,
                                   seed=seed)
            if confidence_schedule is not None:
                delta = confidence_schedule.next(it)
            log_row['Delta'] = delta
            if delta <1:
                grad, grad_var = simple_gpomdp_estimator(batch, gamma, policy, baseline, result='moments')
                omega_grad = grad[0]
                omega_grad_var = grad_var[0]
                omega_metagrad, omega_metagrad_var = metagrad(batch, gamma, policy, alpha, clip_at, baseline, result='moments')
                quant = 2 * sts.t.interval(1 - delta, batchsize-1,loc=0.,scale=1.)[1]
                eps = torch.tensor(quant * torch.sqrt(omega_grad_var / batchsize), dtype=torch.float)
                log_row['Eps'] = torch.norm(eps).item()
                metaeps = torch.tensor(quant * torch.sqrt(omega_metagrad_var / batchsize), dtype=torch.float)
                if torch.sign(omega_grad).item() >= 0 and torch.sign(omega_metagrad).item() >= 0:
                    up = torch.clamp(torch.abs(omega_grad - eps), min=0.) * torch.clamp(torch.abs(omega_metagrad - metaeps), min=0.)
                elif torch.sign(omega_grad).item() >= 0 and torch.sign(omega_metagrad).item() < 0:
                    up = (omega_grad + eps) * (omega_metagrad - metaeps)
                elif torch.sign(omega_grad).item() < 0 and torch.sign(omega_metagrad).item() >=0:
                    up = (omega_grad - eps) * (omega_metagrad + metaeps)
                else:
                    up = torch.abs(omega_grad + eps) * torch.abs(omega_metagrad + metaeps)
                down = omega_metagrad + metaeps * torch.sign(omega_metagrad)
                log_row['Up'] = up.item()
                log_row['Down'] = down.item()
                metapenalty = rmax /  (1 - gamma)**2 * (0.53 * avol / (2 * sigma) + gamma / (1 - gamma))
                eta_star = (up / (2 * metapenalty * down**2 + 1e-12)).item()
                Cmax = (up**2 / (4 * metapenalty * down**2 + 1e-12)).item()
            else:
                log_row['Eps'] = 0
                grad = gpomdp_estimator(batch, gamma, policy, 
                                        baselinekind=baseline, 
                                        shallow=shallow)
                theta_grad = grad[1:]
                omega_grad = grad[0]
                #->
                mixed, _ = mixed_estimator(batch, gamma, policy, baseline, theta_grad)
                norm_grad = 2 * theta_grad.dot(mixed)
                A = omega_grad
                B = 2 * alpha * torch.norm(theta_grad)**2
                C = sigma * alpha * norm_grad
                C = torch.clamp(C, min=-clip_at, max=clip_at)
                omega_metagrad = A + B + C
                metapenalty = rmax /  (1 - gamma)**2 * (0.53 * avol / (2 * sigma) + gamma / (1 - gamma))
                eta_star = (omega_grad / (2 * metapenalty * omega_metagrad + 1e-12)).item()
                Cmax = (omega_grad ** 2 / (4 * metapenalty)).item()
                log_row['Up'] = torch.tensor(omega_grad).item()
                log_row['Down'] = torch.tensor(omega_metagrad).item()
        
            perf = performance(batch, gamma)
            Co = thresholder.next(perf)
            Co = min(Co, Cmax)
            log_row['C'] = Co
            log_row['Cmax'] = Cmax
            eta = eta_star + abs(eta_star) * math.sqrt(1 - Co / (Cmax + 1e-12) + 1e-12)
            new_omega = omega + eta * omega_metagrad
            policy.set_scale_params(new_omega)
            ###
        else:
            #Mean update
            omega = policy.get_scale_params()
            sigma = torch.exp(omega).item()
            batch = generate_batch(env, policy, horizon, batchsize, 
                                   action_filter=action_filter, 
                                   n_jobs=parallel, 
                                   seed=seed)
            if confidence_schedule is not None:
                delta = confidence_schedule.next(it)
            log_row['Delta'] = delta
            if delta < 1:
                grad, grad_var = simple_gpomdp_estimator(batch, gamma, policy, baseline, result='moments')
                theta_grad = grad[1:]
                theta_grad_var = grad_var[1:]
                quant = 2*sts.t.interval(1 - delta, batchsize-1,loc=0.,scale=1.)[1]
                eps = quant * torch.sqrt(theta_grad_var / batchsize)
                log_row['Eps'] = torch.norm(eps).item()
                norm2 = torch.norm(torch.clamp(torch.abs(theta_grad) - eps, min=0.))
                norm1 = torch.sum(torch.abs(theta_grad) + eps)
                log_row['Up'] = norm1.item()
                log_row['Down'] = norm2.item()
            else:
                log_row['Eps'] = 0
                grad = simple_gpomdp_estimator(batch, gamma, policy, baseline)
                theta_grad = grad[1:]
                norm2 = torch.norm(theta_grad)
                norm1 = torch.sum(torch.abs(theta_grad))
                log_row['Up'] = norm1.item()
                log_row['Down'] = norm2.item()
            penalty = rmax * phimax**2 / (1-gamma)**2 * (avol / (sigma * math.sqrt(2*math.pi)) + gamma / (2*(1-gamma)))
            alpha_star = sigma ** 2 * norm2 ** 2 / (2 * penalty * norm1 ** 2 + 1e-12)
            Cmax = (alpha_star * norm2**2 / 2).item()
            perf = performance(batch, gamma)
            Co = thresholder.next(perf)
            Co = min(Co, Cmax)
            log_row['C'] = Co
            log_row['Cmax'] = Cmax
            alpha = alpha_star * (1 + math.sqrt(1 - Co / (Cmax + 1e-12) + 1e-12))
            theta = policy.get_loc_params()
            new_theta = theta + alpha * theta_grad
            policy.set_loc_params(new_theta)
            ###


        # Log
        log_row['IterationKind'] = it % 2
        log_row['ThetaGradNorm'] = torch.norm(theta_grad).item()
        log_row['Alpha'] = alpha.item()
        log_row['Eta'] = eta
        log_row['Penalty'] = penalty
        log_row['MetaPenalty'] = metapenalty
        log_row['OmegaGrad'] = torch.tensor(omega_grad).item()
        log_row['OmegaMetagrad'] = torch.tensor(omega_metagrad).item()
        log_row['BatchSize'] = batchsize
        log_row['Exploration'] = policy.exploration()
        log_row['Perf'] = perf
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)
        
        # Next iteration
        it += 1
    
    # Final policy
    if save_params:
        logger.save_params(params, it)
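
sepg alternates a log-std (omega) meta-update on even iterations with a mean (theta) update on odd ones; in both branches the thresholder turns a required improvement Co into a step between the conservative optimum and roughly twice that value. A toy sketch of that final selection with made-up scalars:

import math

# Illustrative scalars standing in for the quantities computed in sepg.
eta_star = 0.05   # unconstrained optimal meta-step for omega
Cmax = 0.3        # best improvement guarantee achievable this iteration
Co = 0.1          # improvement required by the safety thresholder

Co = min(Co, Cmax)
eta = eta_star + abs(eta_star) * math.sqrt(1 - Co / (Cmax + 1e-12) + 1e-12)
# Co == Cmax -> eta is roughly eta_star (most conservative step meeting the guarantee)
# Co == 0    -> eta is roughly 2 * eta_star (largest step with a non-negative guarantee)
print(eta)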
Example #5
def reinforce2(alpha, logsig, env, policy, horizon, *,
              batchsize=100,
              iterations=1000,
              disc=0.99,
              stepper=ConstantStepper(1e-2),
              action_filter=None,
              estimator='gpomdp',
              baseline='avg',
              logger=Logger(name='gpomdp'),
              shallow=False,
              seed=None,
              test_batchsize=False,
              info_key='danger',
              save_params=100,
              log_params=False,
              log_grad=False,
              parallel=False,
              render=False,
              verbose=1):
    """
    REINFORCE/G(PO)MDP algorithm

    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    batchsize: number of trajectories used to estimate policy gradient
    iterations: number of policy updates
    disc: discount factor
    stepper: step size criterion. A constant step size is used by default
    action_filter: function to apply to the agent's action before feeding it to
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward, default), 'peters' (variance-minimizing) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the
        corresponding deterministic policy at each iteration. If 0 or False, no
        test is performed
    save_params: how often (every x iterations) to save the policy
        parameters to disk. Final parameters are always saved for
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False,
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity (0: only logs; 1: normal; 2: maximum)
    """

    # Defaults
    if action_filter is None:
        action_filter = clip(env)

    # Seed agent
    if seed is not None:
        seed_all_agent(seed)

    # Prepare logger
    algo_info = {'Algorithm': 'REINFORCE',
                 'Estimator': estimator,
                 'Baseline': baseline,
                 'Env': str(env),
                 'Horizon': horizon,
                 'BatchSize': batchsize,
                 'Disc': disc,
                 'StepSizeCriterion': str(stepper),
                 'Seed': seed,
                 }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf',
                'UPerf',
                'AvgHorizon',
                'StepSize',
                'GradNorm',
                'Time',
                'Exploration',
                'Info',
                'IterationFails',
                'CumulativeFails']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    # init image & csv
    filename = "../csv/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/data{}.csv".format(alpha, logsig, seed)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    data_file = open(filename, mode='w')
    file_writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    visualizer = MGVisualizer("MG visualizer", "/minigolf/REINFORCE/ALPHA={}/LOGSTD={}/test{}.png".format(alpha, logsig,
                                                                                                          seed))
    visualizer.clean_panels()

    # PLOTTER INFO
    stats = {}
    stats['w1'] = []
    stats['w2'] = []
    stats['w3'] = []
    stats['w4'] = []
    stats['j'] = []
    stats['fail'] = []
    # ------------

    # Learning loop
    it = 0
    cumulative_fail = 0
    cumulative_j = 0
    while it < iterations:
        # Begin iteration
        start = time.time()
        if verbose:
            print('\nIteration ', it)
        params = policy.get_flat()
        if verbose > 1:
            print('Parameters:', params)

        # Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        njobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
            log_row['UTestPerf'] = performance(test_batch, 1)

        # Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True,
                           key=info_key)

        # Collect trajectories
        batch = generate_batch(env, policy, horizon, batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel,
                               key=info_key)

        # ------------------- count fails -------------------
        rewards = [b[2] for b in batch]
        failures = [np.count_nonzero(r==-100) for r in rewards]
        cumulative_fail += sum(failures)
        # ---------------------------------------------------

        perf = performance(batch, disc)
        cumulative_j += perf
        log_row['Perf'] = perf
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['Exploration'] = policy.exploration().item()
        log_row['IterationFails'] = sum(failures)
        log_row['CumulativeFails'] = cumulative_fail

        # Estimate policy gradient
        if estimator == 'gpomdp':
            grad = gpomdp_estimator(batch, disc, policy,
                                    baselinekind=baseline,
                                    shallow=shallow)
        elif estimator == 'reinforce':
            grad = reinforce_estimator(batch, disc, policy,
                                       baselinekind=baseline,
                                       shallow=shallow)
        else:
            raise ValueError('Invalid policy gradient estimator')
        if verbose > 1:
            print('Gradients: ', grad)
        log_row['GradNorm'] = torch.norm(grad).item()

        # Select meta-parameters
        stepsize = stepper.next(grad)
        log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item()

        # Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)

        # Log
        log_row['Time'] = time.time() - start
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
        logger.write_row(log_row, it)

        # Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        print(new_params)
        params = new_params.numpy()[1:]  # updated w

        # update csv & image
        visualizer.show_values(params, perf, cumulative_fail)
        file_writer.writerow([params[0], params[1], params[2], params[3], cumulative_fail, perf])

        # PLOTTER INFO
        # if it % 10 == 0:
        stats['w1'].append(params[0])
        stats['w2'].append(params[1])
        stats['w3'].append(params[2])
        stats['w4'].append(params[3])
        stats['j'].append(perf)
        stats['fail'].append(cumulative_fail)
        # ------------

        # Next iteration
        it += 1

    # Save final parameters
    if save_params:
        logger.save_params(params, it)

    visualizer.save_image()
    # Cleanup
    logger.close()
    return stats, cumulative_j
Example #6
def reinforce(env,
              policy,
              horizon,
              *,
              batchsize=100,
              iterations=1000,
              disc=0.99,
              stepper=ConstantStepper(1e-2),
              action_filter=None,
              estimator='gpomdp',
              baseline='avg',
              logger=Logger(name='gpomdp'),
              shallow=False,
              seed=None,
              test_batchsize=False,
              info_key='danger',
              save_params=100,
              log_params=False,
              log_grad=False,
              parallel=False,
              render=False,
              verbose=1):
    """
    REINFORCE/G(PO)MDP algorithm
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    batchsize: number of trajectories used to estimate policy gradient
    iterations: number of policy updates
    disc: discount factor
    stepper: step size criterion. A constant step size is used by default
    action_filter: function to apply to the agent's action before feeding it to 
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward, default), 'peters' (variance-minimizing) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the 
        corresponding deterministic policy at each iteration. If 0 or False, no 
        test is performed
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity (0: only logs; 1: normal; 2: maximum)
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'REINFORCE',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'BatchSize': batchsize,
        'Disc': disc,
        'StepSizeCriterion': str(stepper),
        'Seed': seed,
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'Exploration', 'Info'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Learning loop
    it = 0
    while (it < iterations):
        #Begin iteration
        start = time.time()
        if verbose:
            print('\nIteration ', it)
        params = policy.get_flat()
        if verbose > 1:
            print('Parameters:', params)

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        njobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
            log_row['UTestPerf'] = performance(test_batch, 1)

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env,
                           policy,
                           horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True,
                           key=info_key)

        #Collect trajectories
        batch = generate_batch(env,
                               policy,
                               horizon,
                               batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel,
                               key=info_key)
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['Exploration'] = policy.exploration().item()

        #Estimate policy gradient
        if estimator == 'gpomdp':
            grad = gpomdp_estimator(batch,
                                    disc,
                                    policy,
                                    baselinekind=baseline,
                                    shallow=shallow)
        elif estimator == 'reinforce':
            grad = reinforce_estimator(batch,
                                       disc,
                                       policy,
                                       baselinekind=baseline,
                                       shallow=shallow)
        else:
            raise ValueError('Invalid policy gradient estimator')
        if verbose > 1:
            print('Gradients: ', grad)
        log_row['GradNorm'] = torch.norm(grad).item()

        #Select meta-parameters
        stepsize = stepper.next(grad)
        log_row['StepSize'] = torch.norm(torch.tensor(stepsize)).item()

        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)

        #Log
        log_row['Time'] = time.time() - start
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
        logger.write_row(log_row, it)

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
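
A possible end-to-end invocation of reinforce, shown only as a sketch: the environment name and hyperparameters are arbitrary, and the potion import paths are assumptions about the project layout (only ShallowGaussianPolicy, Logger, ConstantStepper and the reinforce signature appear in the examples above); adjust the paths to your checkout.

import gym
import torch

# NOTE: these module paths are assumptions, not confirmed by the examples above.
from potion.actors.continuous_policies import ShallowGaussianPolicy
from potion.common.logger import Logger
from potion.meta.steppers import ConstantStepper

env = gym.make('Pendulum-v0')  # any continuous-control task (assumption)
m = sum(env.observation_space.shape)
d = sum(env.action_space.shape)
policy = ShallowGaussianPolicy(m, d,
                               mu_init=torch.zeros(m),
                               logstd_init=torch.zeros(1),
                               learn_std=True)

reinforce(env, policy, horizon=200,
          batchsize=100,
          iterations=50,
          disc=0.99,
          stepper=ConstantStepper(1e-2),
          estimator='gpomdp',
          baseline='peters',
          logger=Logger(name='reinforce_demo'),
          shallow=True,
          seed=42)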
Example #7
def adabatch(env,
             policy,
             horizon,
             pen_coeff,
             *,
             bound='chebyshev',
             var_bound=None,
             grad_range=None,
             conf=0.2,
             min_batchsize=32,
             max_batchsize=10000,
             iterations=float('inf'),
             max_samples=1e6,
             disc=0.9,
             action_filter=None,
             estimator='gpomdp',
             baseline='peters',
             logger=Logger(name='AdaBatch'),
             shallow=True,
             meta_conf=0.05,
             seed=None,
             test_batchsize=False,
             info_key='danger',
             save_params=100,
             log_params=True,
             log_grad=False,
             parallel=False,
             render=False,
             verbose=1):
    """
    Safe PG algorithm from "Adaptive Batch Size for Safe Policy Gradients",
                        Papini et al., 2017.
    Only for Gaussian policies.
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    pen_coeff: penalty coefficient for policy update
    bound: statistical inequality used to determine optimal batchsize 
        (chebyshev/student/hoeffding/bernstein)
    var_bound: upper bound on the variance of the PG estimator. Must not be 
        None if Chebyshev's bound is employed
    grad_range: theoretical range of gradient estimate. If none, it is 
        estimated from data (in a biased way)
    conf: probability of failure
    min_batchsize: minimum number of trajectories to estimate policy gradient
    max_batchsize: maximum number of trajectories to estimate policy gradient
    iterations: number of policy updates
    max_samples: maximum number of total trajectories
    disc: discount factor
    action_filter: function to apply to the agent's action before feeding it to 
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    meta_conf: confidence level of safe-update test (for evaluation only)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the 
        corresponding deterministic policy at each iteration. If 0 or False, no 
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)
    if bound == 'chebyshev' and var_bound is None:
        raise NotImplementedError
    empirical_range = (grad_range is None)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'AdaBatch',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'ConfidenceParam': conf,
        'Seed': seed,
        'MinBatchSize': min_batchsize,
        'MaxBatchSize': max_batchsize,
        'PenalizationCoefficient': pen_coeff,
        'VarianceBound': var_bound,
        'Bound': bound
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'BatchSize', 'Info', 'TotSamples', 'GradVar', 'GradRange',
        'Safety', 'Err', 'GradInfNorm'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Initializations
    it = 0
    tot_samples = 0
    safety = 1.
    optimal_batchsize = min_batchsize
    _estimator = (reinforce_estimator
                  if estimator == 'reinforce' else gpomdp_estimator)
    updated = False
    updates = 0
    unsafe_updates = 0
    params = policy.get_flat()
    max_grad = torch.zeros_like(params) - float('inf')
    min_grad = torch.zeros_like(params) + float('inf')

    #Learning loop
    while (it < iterations and tot_samples < max_samples):
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env,
                           policy,
                           horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True)

        #Collect trajectories according to previous optimal batch size
        batch = generate_batch(env,
                               policy,
                               horizon,
                               episodes=max(
                                   min_batchsize,
                                   min(max_batchsize, optimal_batchsize)),
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)
        batchsize = len(batch)

        #Estimate policy gradient
        grad_samples = _estimator(batch,
                                  disc,
                                  policy,
                                  baselinekind=baseline,
                                  shallow=shallow,
                                  result='samples')
        grad = torch.mean(grad_samples, 0)
        grad_infnorm = torch.max(torch.abs(grad))
        coordinate = torch.min(torch.argmax(torch.abs(grad))).item()

        #Compute statistics for estimation error
        if bound in ['bernstein', 'student']:
            grad_var = torch.var(grad_samples, 0, unbiased=True)
            grad_var = torch.max(grad_var).item()
            log_row['GradVar'] = grad_var
        else:
            log_row['GradVar'] = var_bound
        if bound in ['bernstein', 'hoeffding'] and empirical_range:
            max_grad = torch.max(grad, max_grad)
            min_grad = torch.min(min_grad, grad)
            grad_range = torch.max(max_grad - min_grad).item()
            if grad_range <= 0:
                grad_range = torch.max(2 * abs(max_grad)).item()
        log_row['GradRange'] = grad_range

        #Compute estimation error
        if bound == 'chebyshev':
            eps = math.sqrt(var_bound / conf)
        elif bound == 'student':
            quant = sts.t.ppf(1 - conf, batchsize)
            eps = quant * math.sqrt(grad_var)
        elif bound == 'hoeffding':
            eps = grad_range * math.sqrt(math.log(2. / conf) / 2)
        elif bound == 'bernstein':
            eps = math.sqrt(2 * grad_var * math.log(3. / conf))
            eps2 = 3 * grad_range * math.log(3. / conf)

        #Compute optimal batch size
        if bound in ['chebyshev', 'student', 'hoeffding']:
            optimal_batchsize = math.ceil(((13 + 3 * math.sqrt(17)) * eps**2 /
                                           (2 * grad_infnorm**2)).item())
            min_safe_batchsize = math.ceil((eps**2 / grad_infnorm**2).item())
        else:
            min_safe_batchsize = math.ceil(
                ((eps + math.sqrt(eps**2 + 4 * eps2 * grad_infnorm)) /
                 (2 * grad_infnorm))**2)
            optimal_batchsize = min_safe_batchsize
            _stepsize = ((grad_infnorm - eps / math.sqrt(optimal_batchsize) -
                          eps2 / optimal_batchsize)**2 /
                         (2 * pen_coeff *
                          (grad_infnorm + eps / math.sqrt(optimal_batchsize) +
                           eps2 / optimal_batchsize)**2)).item()
            ups = (grad_infnorm**2 * _stepsize * (1 - pen_coeff * _stepsize) /
                   optimal_batchsize)
            old_ups = -float('inf')
            while ups > old_ups:
                optimal_batchsize += 1
                old_ups = ups
                _stepsize = (
                    (grad_infnorm - eps / math.sqrt(optimal_batchsize) -
                     eps2 / optimal_batchsize)**2 /
                    (2 * pen_coeff *
                     (grad_infnorm + eps / math.sqrt(optimal_batchsize) +
                      eps2 / optimal_batchsize)**2)).item()
                ups = (grad_infnorm**2 * _stepsize *
                       (1 - pen_coeff * _stepsize) / optimal_batchsize)
            optimal_batchsize -= 1

        if verbose:
            print('Optimal batch size: %d' % optimal_batchsize)

        #Update long-term quantities
        tot_samples += batchsize

        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)' %
                          (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Log
        log_row['Err'] = eps
        log_row['Safety'] = safety
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['GradInfNorm'] = grad_infnorm.item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        #Check if number of samples is sufficient to perform update
        if grad_infnorm < eps / math.sqrt(batchsize):
            updated = False
            if verbose:
                print('No update, need more samples')

            #Log
            log_row['StepSize'] = 0.
            log_row['Time'] = time.time() - start
            if verbose:
                print(separator)
            logger.write_row(log_row, it)
            if verbose:
                print(separator)

            #Skip to next iteration (current trajectories are discarded)
            it += 1
            continue

        #Select step size
        if bound == 'bernstein':
            stepsize = ((grad_infnorm - eps / math.sqrt(batchsize) -
                         eps2 / batchsize)**2 /
                        (2 * pen_coeff *
                         (grad_infnorm + eps / math.sqrt(batchsize) +
                          eps2 / batchsize)**2)).item()
        else:
            stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff)
        log_row['StepSize'] = stepsize

        #Update policy parameters
        new_params = params.clone()  #avoid modifying the logged/saved parameter vector in place
        new_params[coordinate] = (params[coordinate] +
                                  stepsize * grad[coordinate])
        policy.set_from_flat(new_params)
        updated = True
        updates += 1

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
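
For the Chebyshev (and Student/Hoeffding) bounds the batch-size and step-size rules above have closed forms. A toy numeric sketch of just those formulas, with invented values:

import math

# Invented values: a variance bound for the gradient estimator, a failure
# probability, the infinity-norm of the current gradient estimate, and the
# penalty coefficient.
var_bound, conf = 4.0, 0.2
grad_infnorm = 0.5
pen_coeff = 1.0

# Chebyshev bound on the per-trajectory estimation error
eps = math.sqrt(var_bound / conf)

# Batch size maximizing the guaranteed improvement per trajectory, and the
# smallest batch size for which any safe update is possible
optimal_batchsize = math.ceil((13 + 3 * math.sqrt(17)) * eps**2 /
                              (2 * grad_infnorm**2))
min_safe_batchsize = math.ceil(eps**2 / grad_infnorm**2)

# With these bounds the step size is a constant
stepsize = (13 - 3 * math.sqrt(17)) / (4 * pen_coeff)
print(optimal_batchsize, min_safe_batchsize, stepsize)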
Example #8
def adastep(env,
            policy,
            horizon,
            pen_coeff,
            var_bound,
            *,
            conf=0.2,
            batchsize=5000,
            iterations=float('inf'),
            max_samples=1e6,
            disc=0.9,
            action_filter=None,
            estimator='gpomdp',
            baseline='peters',
            logger=Logger(name='AdaStep'),
            shallow=True,
            meta_conf=0.05,
            seed=None,
            test_batchsize=False,
            info_key='danger',
            save_params=100,
            log_params=True,
            log_grad=False,
            parallel=False,
            render=False,
            verbose=1):
    """
    Safe PG algorithm from "Adaptive Step Size for Policy Gradient Methods", 
                        Pirotta et al., 2013.
    Only for Gaussian policies.
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    pen_coeff: penalty coefficient for policy update
    var_bound: upper bound on the variance of the PG estimator
    conf: probability of failure
    batchsize: number of trajectories to estimate policy gradient
    iterations: maximum number of learning iterations
    max_samples: maximum number of total trajectories
    disc: discount factor
    action_filter: function to apply to the agent's action before feeding it to 
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard...)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    meta_conf: confidence level of safe-update test (for evaluation only)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the 
        corresponding deterministic policy at each iteration. If 0 or False, no 
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If 0 or False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity on standard output
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'AdaStep',
        'Estimator': estimator,
        'Baseline': baseline,
        'Env': str(env),
        'Horizon': horizon,
        'Discount': disc,
        'Confidence': conf,
        'ConfidenceParam': conf,
        'Seed': seed,
        'BatchSize': batchsize,
        'PenalizationCoefficient': pen_coeff,
        'VarianceBound': var_bound
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'GradNorm', 'Time',
        'BatchSize', 'Info', 'TotSamples', 'Safety'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())

    #Initializations
    it = 0
    tot_samples = 0
    safety = 1.
    _estimator = (reinforce_estimator
                  if estimator == 'reinforce' else gpomdp_estimator)
    updated = False
    updates = 0
    unsafe_updates = 0
    eps = math.sqrt(var_bound / conf)

    #Learning loop
    while (it < iterations and tot_samples < max_samples):
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()

        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        episodes=test_batchsize,
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()

        #Render the agent's behavior
        if render and it % render == 0:
            generate_batch(env,
                           policy,
                           horizon,
                           episodes=1,
                           action_filter=action_filter,
                           render=True)

        #Collect trajectories according to fixed batch size
        batch = generate_batch(env,
                               policy,
                               horizon,
                               episodes=batchsize,
                               action_filter=action_filter,
                               n_jobs=parallel,
                               key=info_key)

        #Estimate policy gradient
        grad_samples = _estimator(batch,
                                  disc,
                                  policy,
                                  baselinekind=baseline,
                                  shallow=shallow,
                                  result='samples')
        grad = torch.mean(grad_samples, 0)

        lower = torch.clamp(
            torch.abs(grad) - eps / math.sqrt(batchsize), 0, float('inf'))
        upper = torch.abs(grad) + eps / math.sqrt(batchsize)

        #Update long-term quantities
        tot_samples += batchsize

        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)' %
                          (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Log
        log_row['Safety'] = safety
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()

        #Check whether the gradient lower bound allows an update at all
        if torch.norm(lower) == 0:
            updated = False
            if verbose:
                print('No update, would require more samples')
            log_row['StepSize'] = 0.
        else:
            #Select step size
            stepsize = (torch.norm(lower)**2 /
                        (2 * pen_coeff * torch.sum(upper)**2)).item()
            log_row['StepSize'] = stepsize

            #Update policy parameters
            new_params = params + stepsize * grad
            policy.set_from_flat(new_params)
            updated = True
            updates += 1

        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)

        it += 1

    #Save final parameters
    if save_params:
        logger.save_params(params, it)

    #Cleanup
    logger.close()
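The update rule above shrinks each gradient coordinate by the estimation error eps / sqrt(batchsize) to get a pessimistic lower bound, inflates it by the same amount for an upper bound, and sets the step size to ||lower||^2 / (2 * pen_coeff * sum(upper)^2). A minimal sketch of that computation in isolation; var_bound, conf, pen_coeff and the gradient below are made-up numbers, not taken from any real run:

import math
import torch

var_bound = 1.     # assumed bound on the gradient estimator's variance
conf = 0.25        # confidence parameter
pen_coeff = 1.     # penalization coefficient
batchsize = 100
grad = torch.tensor([0.8, -0.3])          # toy gradient estimate
eps = math.sqrt(var_bound / conf)         # concentration radius
lower = torch.clamp(torch.abs(grad) - eps / math.sqrt(batchsize), 0, float('inf'))
upper = torch.abs(grad) + eps / math.sqrt(batchsize)
stepsize = (torch.norm(lower)**2 /
            (2 * pen_coeff * torch.sum(upper)**2)).item()
print(stepsize)    # 0 whenever the error dominates every gradient coordinate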
Example #9
File: sepg.py Project: davide1096/potion
m = sum(env.observation_space.shape)
d = sum(env.action_space.shape)
mu_init = torch.zeros(m)
logstd_init = torch.log(torch.zeros(1) + args.sigmainit)
policy = ShallowGaussianPolicy(m,
                               d,
                               mu_init=mu_init,
                               logstd_init=logstd_init,
                               learn_std=True)

if args.safety == 'mi':
    safety_req = MonotonicImprovement(0.)
elif args.safety == 'budget':
    from potion.common.misc_utils import seed_all_agent
    seed_all_agent(args.seed)
    env.seed(args.seed)
    batch = generate_batch(env, policy, args.horizon, args.batchsize)
    perf = performance(batch, args.gamma)
    safety_req = Budget(initial_perf=perf)
else:
    safety_req = FixedThreshold(threshold=float(args.safety))

env.seed(args.seed)

test_batchsize = 100 if args.test else 0

envname = re.sub(r'[^a-zA-Z]', "", args.env)[:-1].lower()
logname = envname + '_' + args.name + '_' + str(args.seed)
Example #10
        _sample = torch.sum(G * values * mask.unsqueeze(1), 0)
        if result == 'samples':
            return _sample, res_2, res_3
        else:
            return incr_mean(cum_1, _sample, tot_trajs), res_2, res_3
        
"""Testing"""
if __name__ == '__main__':
    from potion.actors.continuous_policies import ShallowGaussianPolicy as Gauss
    from potion.simulation.trajectory_generators import generate_batch
    from potion.common.misc_utils import seed_all_agent
    import potion.envs
    import gym.spaces
    env = gym.make('ContCartPole-v0')
    env.seed(0)
    seed_all_agent(0)
    N = 100
    H = 100
    disc = 0.99
    pol = Gauss(4, 1, mu_init=[0., 0., 0., 0.], learn_std=True)
    
    batch = generate_batch(env, pol, H, N)
    
    o = gpomdp_estimator(batch, disc, pol, baselinekind='peters', 
                         shallow=True)
    print('Shallow GPOMDP (peters):', o)
    #o = gpomdp_estimator(batch, disc, pol, baselinekind='peters')
    #print('GPOMDP (peters)', o)
    #print()
    
    print('Cum version')
Example #11
def mepg(env,
         policy,
         horizon,
         batchsize=500,
         iterations=200,
         disc=0.99,
         alpha=1e-1,
         eta=1e-3,
         clip_at=100,
         test_batchsize=False,
         render=False,
         seed=None,
         action_filter=None,
         parallel=False,
         logger=Logger(name='MEPG'),
         save_params=50,
         log_params=True,
         verbose=True):
    """
        MEPG algorithm
        Only for shallow Gaussian policy w/ scalar variance
    """

    #Defaults
    assert type(policy) == ShallowGaussianPolicy
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'MEPG',
        'Environment': str(env),
        'BatchSize': batchsize,
        'Horizon': horizon,
        'Iterations': iterations,
        'Disc': disc,
        'Alpha': alpha,
        'Eta': eta,
        'Seed': seed,
        'ActionFilter': action_filter
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'MetaStepSize', 'BatchSize',
        'Exploration', 'OmegaGrad', 'OmegaMetagrad', 'UpsilonGradNorm'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)

    logger.open(log_row.keys())

    #Learning loop
    it = 0
    while (it < iterations):
        #Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())

        #Test mean parameters on deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        n_jobs=parallel,
                                        deterministic=True)
            log_row['DetPerf'] = performance(test_batch, disc)
        #Render behavior
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        #Set metaparameters
        omega = policy.get_scale_params()
        sigma = torch.exp(omega)
        stepsize = alpha * sigma**2

        #Collect trajectories
        batch = generate_batch(env,
                               policy,
                               horizon,
                               batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel)

        #Estimate policy gradient
        grad = gpomdp_estimator(batch,
                                disc,
                                policy,
                                baselinekind='peters',
                                shallow=True)
        upsilon_grad = grad[1:]
        omega_grad = grad[0]

        omega_metagrad = metagrad(batch,
                                  disc,
                                  policy,
                                  alpha,
                                  clip_at,
                                  grad=grad)

        upsilon = policy.get_loc_params()
        new_upsilon = upsilon + stepsize * upsilon_grad
        policy.set_loc_params(new_upsilon)

        new_omega = omega + eta * omega_metagrad
        policy.set_scale_params(new_omega)

        # Log
        log_row['Exploration'] = policy.exploration()
        log_row['StepSize'] = stepsize.item()
        log_row['MetaStepSize'] = eta
        log_row['OmegaGrad'] = omega_grad.item()
        log_row['OmegaMetagrad'] = omega_metagrad.item()
        log_row['UpsilonGradNorm'] = torch.norm(upsilon_grad).item()
        log_row['BatchSize'] = batchsize
        log_row['Perf'] = performance(batch, disc)
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        # Next iteration
        it += 1

    # Final policy
    if save_params:
        logger.save_params(params, it)
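A minimal usage sketch for the mepg routine above. It assumes mepg has been imported from wherever it is defined (its module path is not shown in these examples) and reuses the ContCartPole-v0 environment and ShallowGaussianPolicy seen in the other snippets; the horizon, batch size and iteration counts are illustrative only.

import gym
import torch
import potion.envs  # environment registration, as in the other examples
from potion.actors.continuous_policies import ShallowGaussianPolicy

env = gym.make('ContCartPole-v0')
policy = ShallowGaussianPolicy(4, 1,                 # 4 state features, 1 action
                               mu_init=torch.zeros(4),
                               logstd_init=torch.zeros(1),
                               learn_std=True)       # mepg asserts learn_std
mepg(env, policy,
     horizon=100,
     batchsize=100,
     iterations=10,
     disc=0.99,
     alpha=1e-1,   # the actual step size applied to the mean is alpha * sigma**2
     eta=1e-3,     # meta step size for the scale parameter omega
     seed=0,
     verbose=True)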
Example #12
File: sepg.py Project: davide1096/potion
def ssepg(env,
          policy,
          horizon,
          batchsize=100,
          iterations=200,
          disc=0.99,
          pow_alpha=0.01,
          pow_err_tol=0.1,
          max_pow_it=100,
          max_pow_attempts=3,
          pow_clip=0.2,
          safety_req=MonotonicImprovement(0.),
          conf=0.2,
          adapt_batchsize=False,
          test_batchsize=False,
          render=False,
          seed=None,
          action_filter=None,
          parallel=False,
          logger=Logger(name='SEPG'),
          save_params=50,
          log_params=True,
          verbose=True):
    """
        SSEPG algorithm
        Only for shallow Gaussian policy w/ scalar variance
    """

    #Defaults
    assert type(policy) == ShallowGaussianPolicy
    assert policy.learn_std
    if action_filter is None:
        action_filter = clip(env)

    #Seed agent
    if seed is not None:
        seed_all_agent(seed)

    #Prepare logger
    algo_info = {
        'Algorithm': 'SSEPG',
        'Environment': str(env),
        'BatchSize': batchsize,
        'Horizon': horizon,
        'Iterations': iterations,
        'Disc': disc,
        'Seed': seed,
        'ActionFilter': action_filter
    }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = [
        'Perf', 'UPerf', 'AvgHorizon', 'StepSize', 'MetaStepSize', 'BatchSize',
        'Exploration', 'OmegaGrad', 'OmegaMetagrad', 'UpsilonGradNorm',
        'UpsilonGradVar', 'UpsilonEps', 'OmegaGradVar', 'OmegaEps', 'Req',
        'MinBatchSize', 'MaxReq', 'UpsilonLip', 'OmegaLip'
    ]
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys.append('DetPerf')
    log_row = dict.fromkeys(log_keys)

    logger.open(log_row.keys())

    #Learning loop
    it = 0
    stepsize = 0.
    metastepsize = 0.
    omega_grad_var = 0.
    omega_eps = 0.
    omega_metagrad = torch.zeros(1)
    G = 0.
    while (it < iterations):
        #Begin iteration
        if verbose:
            print('\nIteration ', it)
        if verbose:
            print('Params: ', policy.get_flat())

        #Test mean parameters on deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env,
                                        policy,
                                        horizon,
                                        test_batchsize,
                                        action_filter=action_filter,
                                        seed=seed,
                                        n_jobs=parallel,
                                        deterministic=True)
            log_row['DetPerf'] = performance(test_batch, disc)
        #Render behavior
        if render:
            generate_batch(env, policy, horizon, 1, action_filter, render=True)

        #Set metaparameters
        omega = policy.get_scale_params()
        sigma = torch.exp(omega)

        #Collect trajectories
        batch = generate_batch(env,
                               policy,
                               horizon,
                               batchsize,
                               action_filter=action_filter,
                               seed=seed,
                               n_jobs=parallel)
        perf = performance(batch, disc)

        #Estimate policy gradient
        grad_samples = gpomdp_estimator(batch,
                                        disc,
                                        policy,
                                        baselinekind='peters',
                                        shallow=True,
                                        result='samples')
        grad = torch.mean(grad_samples, 0)
        upsilon_grad = grad[1:]
        upsilon_grad_norm = torch.norm(upsilon_grad)
        omega_grad = grad[0]
        dfn = upsilon_grad.shape[0]

        ### Mean-update iteration
        if it % 2 == 0:
            #Compute gradient estimation error for mean parameters
            if conf < 1 and grad_samples.size()[1] > 2:
                centered = grad_samples[:, 1:] - upsilon_grad.unsqueeze(0)
                grad_cov = batchsize / (batchsize - 1) * torch.mean(
                    torch.bmm(centered.unsqueeze(2), centered.unsqueeze(1)), 0)
                upsilon_grad_var = torch.sum(torch.diag(grad_cov)).item()
                max_eigv = eigsh(grad_cov.numpy(), 1)[0][0]
                quant = sts.f.ppf(1 - conf, dfn, batchsize - dfn)
                upsilon_eps = math.sqrt(max_eigv * dfn * quant)
            elif conf < 1:
                upsilon_grad_var = torch.var(grad_samples[:, 1]).item()
                quant = sts.t.ppf(1 - conf / 2, batchsize - 1)
                upsilon_eps = quant * math.sqrt(upsilon_grad_var)
            else:
                upsilon_eps = 0.
                upsilon_grad_var = 0.

            #Compute safe step size for mean parameters
            mask = torch.ones_like(grad)
            mask[0] = 0.
            F = power(policy,
                      batch,
                      grad,
                      disc,
                      pow_alpha=pow_alpha,
                      err_tol=pow_err_tol,
                      max_it=max_pow_it,
                      max_attempts=max_pow_attempts,
                      shallow=True,
                      clip=pow_clip,
                      verbose=verbose,
                      mask=mask)
            req = safety_req.next(perf)
            max_req = (upsilon_grad_norm - upsilon_eps / math.sqrt((batchsize - dfn)))**2 / \
                        (2 * F)
            req = min(req, max_req)
            alpha = (upsilon_grad_norm - upsilon_eps / math.sqrt((batchsize - dfn)))  / \
                        F * \
                        (1 + math.sqrt(1 - req / max_req))
            stepsize = (alpha / upsilon_grad_norm).item()

            #Ensure minimum safe batchsize
            min_batchsize = math.ceil(
                (upsilon_eps**2 / upsilon_grad_norm**2).item() + 1e-12) + dfn
            if conf < 1 and adapt_batchsize:
                batchsize = max(batchsize, min_batchsize)

            #Update mean parameters
            upsilon = policy.get_loc_params()
            new_upsilon = upsilon + alpha * sigma**2 * upsilon_grad / upsilon_grad_norm
            policy.set_loc_params(new_upsilon)
        ###
        ### Variance-update iteration
        else:
            #Estimate meta gradient (alpha from previous step)
            omega_metagrad = metagrad(batch,
                                      disc,
                                      policy,
                                      alpha,
                                      grad_samples=grad_samples)
            omega_metagrad_norm = torch.norm(omega_metagrad)

            #Compute gradient estimation error for variance parameter
            if conf < 1:
                omega_grad_var = torch.var(grad_samples[:, 0]).item()
                quant = sts.t.ppf(1 - conf / 2, batchsize - 1)
                omega_eps = quant * math.sqrt(omega_grad_var / batchsize)
            else:
                omega_grad_var = 0.
                omega_eps = 0.

            #Compute safe meta step size
            mask = torch.zeros_like(grad)
            mask[0] = 1.
            G = power(policy,
                      batch,
                      grad,
                      disc,
                      pow_alpha=pow_alpha,
                      err_tol=pow_err_tol,
                      max_it=max_pow_it,
                      max_attempts=max_pow_attempts,
                      shallow=True,
                      clip=pow_clip,
                      verbose=verbose,
                      mask=mask)
            req = safety_req.next(perf)
            proj = omega_grad.view(-1).dot(
                omega_metagrad.view(-1)) / torch.norm(omega_metagrad)
            max_req = (torch.abs(proj) -
                       omega_eps / math.sqrt(batchsize))**2 / (2 * G)
            req = min(req, max_req)
            eta = (torch.abs(proj) - omega_eps / math.sqrt(batchsize)) / \
                    G * \
                    (torch.sign(proj) + torch.sqrt(1 - req / max_req))
            metastepsize = (eta / omega_metagrad_norm).item()

            #Ensure minimum safe batchsize
            min_batchsize = math.ceil((omega_eps**2 / proj**2).item() + 1e-12)
            if conf < 1 and adapt_batchsize:
                batchsize = max(batchsize, min_batchsize)

            #Update variance parameters
            new_omega = omega + eta * omega_metagrad / omega_metagrad_norm
            policy.set_scale_params(new_omega)

        # Log
        log_row['UpsilonLip'] = F
        log_row['OmegaLip'] = G
        log_row['Exploration'] = sigma.item()
        log_row['StepSize'] = stepsize
        log_row['MetaStepSize'] = metastepsize
        log_row['OmegaGrad'] = omega_grad.item()
        log_row['OmegaMetagrad'] = omega_metagrad.item()
        log_row['UpsilonGradNorm'] = torch.norm(upsilon_grad).item()
        log_row['BatchSize'] = batchsize
        log_row['Perf'] = perf
        log_row['UPerf'] = performance(batch, 1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['UpsilonGradVar'] = upsilon_grad_var
        log_row['UpsilonEps'] = upsilon_eps
        log_row['OmegaGradVar'] = omega_grad_var
        log_row['OmegaEps'] = omega_eps
        log_row['Req'] = req
        if verbose:
            print('Minimum safe batch size:', min_batchsize)
        log_row['MinBatchSize'] = min_batchsize
        log_row['MaxReq'] = max_req.item()
        params = policy.get_flat()
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        logger.write_row(log_row, it)
        if save_params and it % save_params == 0:
            logger.save_params(params, it)

        # Next iteration
        it += 1

    # Final policy
    if save_params:
        logger.save_params(params, it)
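The mean-update branch of ssepg above chooses its step size from a pessimistic gradient norm, the estimation error, the curvature constant F returned by the power method, and the improvement required by the safety constraint. A minimal sketch of that rule with made-up numbers (none of the values below come from a real run):

import math

grad_norm = 2.0        # ||upsilon_grad|| (illustrative)
eps = 0.5              # estimation error upsilon_eps (illustrative)
batchsize, dfn = 100, 4
F = 10.0               # curvature constant from the power method (illustrative)
req = 0.05             # improvement demanded by safety_req.next(perf)

margin = grad_norm - eps / math.sqrt(batchsize - dfn)  # pessimistic gradient norm
max_req = margin**2 / (2 * F)        # largest improvement that can be guaranteed
req = min(req, max_req)
alpha = margin / F * (1 + math.sqrt(1 - req / max_req))
stepsize = alpha / grad_norm         # the value logged as 'StepSize' above
print(max_req, stepsize)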
Example #13
import gym
import numpy as np
import matplotlib.pyplot as plt
import potion.envs  # environment registration, as in the other examples
from potion.actors.continuous_policies import ShallowGaussianPolicy
from potion.simulation.trajectory_generators import generate_batch
from potion.estimation.gradients import gpomdp_estimator
from potion.common.misc_utils import seed_all_agent
from potion.meta.smoothing_constants import gauss_lip_const

env = gym.make('lqr1d-v0')
std = 0.1
disc = 0.9
horizon = 20
batchsize = 100
points = 50
max_feat = env.max_pos
max_rew = env.Q * env.max_pos**2 + env.R * env.max_action**2
seed = 0
env.seed(seed)
seed_all_agent(seed)
pol = ShallowGaussianPolicy(1, 1, learn_std=False, logstd_init=np.log(std))

estimated = []
bound = []
real = []
fo = []
op = []
params = np.linspace(-1., 0., points)
it = 0
for param in params:
    plt.close()
    it += 1
    print(it)
    pol.set_from_flat([param])
    batch = generate_batch(env, pol, horizon, batchsize)
Example #14
def semisafepg(env, policy, horizon, *,
                    conf = 0.05,
                    min_batchsize = 32,
                    max_batchsize = 5000,
                    iterations = float('inf'),
                    max_samples = 1e6,
                    disc = 0.9,
                    forget = 0.1,
                    action_filter = None,
                    estimator = 'gpomdp',
                    baseline = 'peters',
                    logger = Logger(name='SSPG'),
                    shallow = True,
                    pow_step = 0.01,
                    pow_decay = 0.99,
                    pow_it = 100,
                    pow_tol = 0.05,
                    pow_clip = 0.1,
                    fast = False,
                    meta_conf = 0.05,
                    seed = None,
                    test_batchsize = False,
                    info_key = 'danger',
                    save_params = 100,
                    log_params = True,
                    log_grad = False,
                    parallel = False,
                    render = False,
                    verbose = 1):
    """
    Semi-safe PG algorithm from "Smoothing Policies and Safe Policy Gradients",
        Papini et al., 2019.
        
    env: environment
    policy: the one to improve
    horizon: maximum task horizon
    conf: probability of unsafety (per update)
    min_batchsize: minimum number of trajectories used to estimate policy 
        gradient
    max_batchsize: maximum number of trajectories used to estimate policy 
        gradient
    iterations: maximum number of learning iterations
    max_samples: maximum number of total trajectories 
    disc: discount factor
    forget: decay of the (estimated) global gradient Lipschitz constant
    action_filter: function to apply to the agent's action before feeding it to 
        the environment, not considered in gradient estimation. By default,
        the action is clipped to satisfy environmental boundaries
    estimator: either 'reinforce' or 'gpomdp' (default). The latter typically
        suffers from less variance
    baseline: control variate to be used in the gradient estimator. Either
        'avg' (average reward), 'peters' (variance-minimizing, default) or
        'zero' (no baseline)
    logger: for human-readable logs (standard output, csv, tensorboard)
    shallow: whether to employ pre-computed score functions (only available for
        shallow policies)
    pow_step: step size of the power method
    pow_decay: initial decay parameter of the power method
    pow_it: maximum number of iterations (per epoch) of the power method
    pow_tol: relative-error tolerance of the power method
    pow_clip: importance-weight clipping parameter for the power method
        (default 0.1)
    fast: whether to pursue maximum convergence speed 
        (under safety constraints)
    meta_conf: confidence level of safe update test (for evaluation)
    seed: random seed (None for random behavior)
    test_batchsize: number of test trajectories used to evaluate the 
        corresponding deterministic policy at each iteration. If False, no 
        test is performed
    info_key: name of the environment info to log
    save_params: how often (every x iterations) to save the policy 
        parameters to disk. Final parameters are always saved for 
        x>0. If False, they are never saved.
    log_params: whether to include policy parameters in the human-readable logs
    log_grad: whether to include gradients in the human-readable logs
    parallel: number of parallel jobs for simulation. If False, 
        sequential simulation is performed.
    render: how often (every x iterations) to render the agent's behavior
        on a sample trajectory. If False, no rendering happens
    verbose: level of verbosity
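    (a usage sketch follows the function definition below)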
    """
    #Defaults
    if action_filter is None:
        action_filter = clip(env)
    
    #Seed agent
    if seed is not None:
        seed_all_agent(seed)
    
    #Prepare logger
    algo_info = {'Algorithm': 'SSPG',
                   'Estimator': estimator,
                   'Baseline': baseline,
                   'Env': str(env), 
                   'Horizon': horizon,
                   'Discount': disc,
                   'Confidence': conf,
                   'ConfidenceParam': conf,
                   'Seed': seed,
                   'MinBatchSize': min_batchsize,
                   'MaxBatchSize': max_batchsize,
                   'ForgetParam': forget,
                   'PowerStep': pow_step,
                   'PowerDecay': pow_decay,
                   'PowerIters': pow_it,
                   'PowerTolerance': pow_tol,
                   'Fast': fast
                   }
    logger.write_info({**algo_info, **policy.info()})
    log_keys = ['Perf', 
                'UPerf', 
                'AvgHorizon', 
                'StepSize', 
                'GradNorm', 
                'Time',
                'BatchSize',
                'LipConst',
                'ErrBound',
                'SampleVar',
                'Info',
                'TotSamples',
                'Safety',
                'UScore']
    if log_params:
        log_keys += ['param%d' % i for i in range(policy.num_params())]
    if log_grad:
        log_keys += ['grad%d' % i for i in range(policy.num_params())]
    if test_batchsize:
        log_keys += ['TestPerf', 'UTestPerf', 'TestInfo']
    log_row = dict.fromkeys(log_keys)
    logger.open(log_row.keys())
    
    #Initializations
    it = 0
    updated = False
    updates = 0
    unsafe_updates = 0
    safety = 1.
    tot_samples = 0
    optimal_batchsize = min_batchsize
    min_safe_batchsize = min_batchsize
    _conf = conf
    _estimator = (reinforce_estimator 
                  if estimator=='reinforce' else gpomdp_estimator)
    old_lip_const = 0.
    dfn = policy.get_flat().shape[0]
    min_batchsize = max(min_batchsize, dfn + 1)
    
    #Learning loop
    while(it < iterations and tot_samples < max_samples):
        start = time.time()
        if verbose:
            print('\n* Iteration %d *' % it)
        params = policy.get_flat()
        
        #Test the corresponding deterministic policy
        if test_batchsize:
            test_batch = generate_batch(env, policy, horizon, 
                                        episodes=test_batchsize, 
                                        action_filter=action_filter,
                                        n_jobs=parallel,
                                        deterministic=True,
                                        key=info_key)
            log_row['TestPerf'] = performance(test_batch, disc)
            log_row['UTestPerf'] = performance(test_batch, 1)
            log_row['TestInfo'] = mean_sum_info(test_batch).item()
        
        #Render the agent's behavior
        if render and it % render==0:
            generate_batch(env, policy, horizon,
                           episodes=1,
                           action_filter=action_filter, 
                           render=True)
    
        #Collect trajectories according to target batch size
        target_batchsize = min_safe_batchsize if fast else optimal_batchsize
        batch = generate_batch(env, policy, horizon, 
                                episodes=max(min_batchsize, 
                                             min(max_batchsize, 
                                                 target_batchsize)), 
                                action_filter=action_filter,
                                n_jobs=parallel,
                                key=info_key)
        batchsize = len(batch)
        
        #Collect more trajectories to match minimum safe batch size
        do = True
        while do or batchsize < min_safe_batchsize:
            do = False
            batch += generate_batch(env, policy, horizon, 
                        episodes=(min(max_batchsize, min_safe_batchsize) 
                                    - batchsize), 
                        action_filter=action_filter,
                        n_jobs=parallel,
                        key=info_key)
            batchsize = len(batch)
            
            #Estimate policy gradient
            grad_samples = _estimator(batch, disc, policy, 
                                        baselinekind=baseline, 
                                        shallow=shallow,
                                        result='samples')
            grad = torch.mean(grad_samples, 0)
                
            #Compute estimation error with ellipsoid confidence region
            centered = grad_samples - grad.unsqueeze(0)
            grad_cov = (batchsize/(batchsize - 1) * 
                        torch.mean(torch.bmm(centered.unsqueeze(2), 
                                             centered.unsqueeze(1)),0))
            grad_var = torch.sum(torch.diag(grad_cov)).item() #for humans
            max_eigv = eigsh(grad_cov.numpy(), 1)[0][0]
            quant = sts.f.ppf(1 - _conf, dfn, batchsize - dfn)
            eps = math.sqrt(max_eigv * dfn * quant)
            
            #Optimal batch size
            optimal_batchsize = torch.ceil(4 * eps**2 / 
                                   (torch.norm(grad)**2) + dfn).item()
            min_safe_batchsize = torch.ceil(eps**2 / 
                                            torch.norm(grad)**2 + dfn).item()
            target_batchsize = (min_safe_batchsize if fast 
                                else optimal_batchsize)
            if verbose and optimal_batchsize < max_batchsize:
                print('Collected %d / %d trajectories' 
                      % (batchsize, target_batchsize))
            elif verbose:
                print('Collected %d / %d trajectories' 
                      % (batchsize, min(max_batchsize, target_batchsize)))
            
            #Adjust confidence before collecting more data for the same update
            if batchsize >= max_batchsize:
                break
            _conf /= 2
        
        if verbose:
            print('Optimal batch size: %d' 
                  % (optimal_batchsize if optimal_batchsize < float('inf') 
                      else -1))
            print('Minimum safe batch size: %d' 
                  % (min_safe_batchsize if min_safe_batchsize < float('inf') 
                      else -1))
            if (batchsize >= min_safe_batchsize 
                and batchsize < optimal_batchsize):
                print('Low sample regime')
                
        #Update safety measure
        if updates == 0:
            old_rets = returns(batch, disc)
        elif updated:
            new_rets = returns(batch, disc)
            tscore, pval = sts.ttest_ind(old_rets, new_rets)
            if pval / 2 < meta_conf and tscore > 0:
                unsafe_updates += 1
                if verbose:
                    print('The previous update was unsafe! (p-value = %f)' 
                          % (pval / 2))
            old_rets = new_rets
            safety = 1 - unsafe_updates / updates

        #Update long-term quantities
        tot_samples += batchsize
        
        #Log
        log_row['SampleVar'] = grad_var
        log_row['UScore'] = torch.norm(grad).item() / math.sqrt(grad_var)
        log_row['Safety'] = safety
        log_row['ErrBound'] = eps
        log_row['Perf'] = performance(batch, disc)
        log_row['Info'] = mean_sum_info(batch).item()
        log_row['UPerf'] = performance(batch, disc=1.)
        log_row['AvgHorizon'] = avg_horizon(batch)
        log_row['GradNorm'] = torch.norm(grad).item()
        log_row['BatchSize'] = batchsize
        log_row['TotSamples'] = tot_samples
        if log_params:
            for i in range(policy.num_params()):
                log_row['param%d' % i] = params[i].item()
        if log_grad:
            for i in range(policy.num_params()):
                log_row['grad%d' % i] = grad[i].item()
                
        #Check if number of samples is sufficient to perform update
        if batchsize < min_safe_batchsize:
            updated = False
            if verbose:
                print('No update, would require more samples than allowed')
            #Log
            log_row['LipConst'] = old_lip_const
            log_row['StepSize'] = 0.
            log_row['Time'] = time.time() - start
            if verbose:
                print(separator)
            logger.write_row(log_row, it)
            if verbose:
                print(separator)
            
            #Adjust confidence before collecting new data for the same update
            _conf /= 2
            
            #Skip to next iteration (current trajectories are discarded)
            it += 1
            continue
        
        #Reset confidence for next update
        _conf = conf
        
        #Estimate gradient Lipschitz constant with off-policy Power Method
        lip_const = power(policy, batch, grad, disc, 
              step=pow_step, 
              decay_rate=pow_decay,
              tol=pow_tol, 
              max_it=pow_it, 
              estimator=_estimator, 
              baseline=baseline, 
              shallow=shallow, 
              clip=pow_clip,
              verbose=verbose)
        
        #Update "global" lipschitz constant
        if it > 0:
            lip_const = (1 - forget) * max(lip_const, old_lip_const) + forget * lip_const
        old_lip_const = lip_const
        log_row['LipConst'] = lip_const
        
        #Select step size
        stepsize = 1. / lip_const * (1 - eps / (torch.norm(grad) 
                                        * math.sqrt(batchsize - dfn)).item())
        if fast:
            stepsize *= 2
        log_row['StepSize'] = stepsize
                
        #Update policy parameters
        new_params = params + stepsize * grad
        policy.set_from_flat(new_params)
        updated = True
        updates += 1
        
        #Save parameters
        if save_params and it % save_params == 0:
            logger.save_params(params, it)
        
        #Next iteration
        log_row['Time'] = time.time() - start
        if verbose:
            print(separator)
        logger.write_row(log_row, it)
        if verbose:
            print(separator)
        it += 1
    
    #Save final parameters
    if save_params:
        logger.save_params(params, it)
    
    #Cleanup
    logger.close()
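As referenced in the docstring above, here is a minimal usage sketch for semisafepg. It assumes the function has been imported from wherever it is defined (its module path is not shown here) and reuses the ContCartPole-v0 environment and ShallowGaussianPolicy from the other examples; the batch-size and sample limits are illustrative only.

import gym
import torch
import potion.envs  # environment registration, as in the other examples
from potion.actors.continuous_policies import ShallowGaussianPolicy

env = gym.make('ContCartPole-v0')
policy = ShallowGaussianPolicy(4, 1,
                               mu_init=torch.zeros(4),
                               logstd_init=torch.zeros(1),
                               learn_std=True)
semisafepg(env, policy, 100,        # horizon; remaining arguments are keyword-only
           conf=0.05,               # per-update probability of an unsafe step
           min_batchsize=32,
           max_batchsize=500,
           max_samples=50000,
           disc=0.99,
           shallow=True,            # use pre-computed scores of the shallow policy
           seed=0,
           verbose=1)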