Example No. 1
        # Gradient clipping
        if args.outer_clip > 0:
            theta_grad = theta_grad * jnp.minimum(
                1., args.outer_clip / (jnp.linalg.norm(theta_grad) + 1e-8))

        theta, optim_params = optimizer_step(theta, theta_grad, optim_params,
                                             i)

        if i % args.log_interval == 0:
            L, _ = unroll(jnp.array(initial_point), theta, 0, args.T,
                          args.T)  # Evaluate on the full unroll
            iteration_logger.writerow({
                'time_elapsed': time.time() - start_time,
                'iteration': i,
                'inner_problem_steps': i * args.K,
                'theta0': float(theta[0]),
                'theta1': float(theta[1]),
                'theta0_grad': float(theta_grad[0]),
                'theta1_grad': float(theta_grad[1]),
                'L': float(L)
            })

            print(
                'Time: {:6.3f} | Meta-iter: {} | theta: {} | theta_grad: {} | L: {:6.3f}'
                .format(time.time() - log_start_time, i, jnp.exp(theta),
                        theta_grad, float(L)))
            sys.stdout.flush()
            log_start_time = time.time()

elif args.estimate == 'rtrl':

    dstate_dtheta = None
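
None of these excerpts shows how iteration_logger is constructed. Below is a minimal sketch, assuming it is a csv.DictWriter over the fields written in Example No. 1; the file name and the exact field list are illustrative, not taken from the original code.

import csv

# Hypothetical setup for the iteration_logger used in the excerpts above.
log_file = open('iteration_log.csv', 'w', newline='')
iteration_logger = csv.DictWriter(log_file, fieldnames=[
    'time_elapsed', 'iteration', 'inner_problem_steps',
    'theta0', 'theta1', 'theta0_grad', 'theta1_grad', 'L'
])
iteration_logger.writeheader()
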
Example No. 2
    if iteration % args.log_every == 0:
        elapsed_time += time.time() - start_time
        fresh_env = gym.make(args.env_name)
        fresh_state = fresh_env.reset()
        total_reward, _, _, _, _ = unroll(theta,
                                          fresh_state,
                                          fresh_env,
                                          0,
                                          args.horizon,
                                          args.horizon,
                                          training=False,
                                          shift=0.0)
        print('time: {} | i: {} | theta_grad_norm: {:6.4f} | total_reward: {}'.
              format(elapsed_time, iteration, jnp.linalg.norm(theta_grad),
                     total_reward))
        sys.stdout.flush()

        iteration_logger.writerow({
            'time': elapsed_time,
            'iteration': iteration,
            'total_steps': total_count,
            'reward': total_reward,
            'theta_grad_norm': jnp.linalg.norm(theta_grad),
        })
        start_time = time.time()
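
The unroll call above evaluates the current theta in a freshly created environment, but its implementation is outside the excerpt. A minimal sketch of what such an evaluation rollout might look like, assuming a policy(theta, observation) -> action function and the classic gym step API (the real unroll also returns four additional values that are discarded here):

def evaluate_rollout(policy, theta, env, horizon):
    # Roll the policy out for up to `horizon` steps and sum the rewards.
    obs = env.reset()
    total_reward = 0.0
    for _ in range(horizon):
        action = policy(theta, obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
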
Example No. 3
        iteration_log_dict = {
            'perf/time_elapsed': time.time() - start_time,
            'perf/outer_iteration': outer_iteration,
            'perf/total_inner_iterations': total_inner_iterations_including_N,
            'perf/train_sum_loss': mean_stats_dict['train_sum_loss'],
            'perf/train_acc': mean_stats_dict['train_acc'],
            'perf/train_mean_loss': mean_stats_dict['train_mean_loss'],
            'perf/val_sum_loss': mean_stats_dict['val_sum_loss'],
            'perf/val_acc': mean_stats_dict['val_acc'],
            'perf/val_mean_loss': mean_stats_dict['val_mean_loss'],
            'perf/unroll_obj': mean_stats_dict['unroll_obj'],
            **constrained_hparams_to_log,
            **hparams_to_log,
        }

        iteration_logger.writerow(iteration_log_dict)

    if outer_iteration % args.log_every == 0:
        hparams_to_log = {}
        for (param_name, value) in zip(param_fieldnames, theta):
            hparams_to_log[param_name] = value

        constrained_hparams_to_log = {}
        for (param_name, value) in zip(cons_param_fieldnames,
                                       to_constrained(theta)):
            constrained_hparams_to_log[param_name] = value

        hparam_grads_to_log = {}
        for (param_name, value) in zip(param_grad_fieldnames, outer_grad):
            hparam_grads_to_log[param_name] = value
Example No. 4
key = jax.random.PRNGKey(3)
for outer_iteration in range(args.outer_iterations):
  key, skey = jax.random.split(key)
  theta_grad = estimator.grad_estimate(theta)

  if args.outer_clip > 0:
    theta_grad = jnp.clip(theta_grad, -args.outer_clip, args.outer_clip)

  theta_update, theta_opt_state = theta_opt.update(theta_grad, theta_opt_state)
  theta = optax.apply_updates(theta, theta_update)

  _, state_mean = unroll(key, theta, state_mean, args.T, args.K)
  total_inner_iterations += 2 * args.K
  total_inner_iterations_including_N = args.N * total_inner_iterations

  if outer_iteration % args.log_every == 0:
    val_loss = L_v(state_mean.inner_state)
    print('Iter: {} | Total iter: {} | Val Loss: {:6.4f} | Theta: {} | Grad: {}'
          .format(outer_iteration, total_inner_iterations_including_N, val_loss,
                  theta, theta_grad))

    iteration_stat_dict = {
        'outer_iteration': outer_iteration,
        'total_inner_iterations': total_inner_iterations_including_N,
        'val_loss': val_loss,
        'theta': theta,
        'theta_grad': theta_grad
    }
    iteration_logger.writerow(iteration_stat_dict)
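
The theta_opt.update / optax.apply_updates pair in this example assumes an optax optimizer whose state was initialized before the loop; that setup is not part of the excerpt. A minimal sketch of the missing initialization, where the choice of Adam and the learning rate value are assumptions rather than details from the original code:

import optax

# Hypothetical optimizer setup matching the update calls above.
theta_opt = optax.adam(learning_rate=1e-2)
theta_opt_state = theta_opt.init(theta)
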
Example No. 5
    # NOTE: this excerpt begins mid-statement; the opening of the call whose
    # arguments spill over below was lost. Judging from the 'uoro' and 'rtrl'
    # branches that follow, it is presumably a PES-style estimator call, so
    # the branch header and call below are a hypothetical reconstruction.
    if args.estimate == 'pes':
        key, skey = jax.random.split(key)
        gradient, states, perturbation_accums = pes_grad(skey, theta, state,
                                                         states,
                                                         perturbation_accums,
                                                         args.K, args.N,
                                                         args.sigma)
        loss, state = unroll(theta, state, args.K)
    elif args.estimate == 'uoro':
        key, skey = jax.random.split(key)
        (loss, state, s_tilde, theta_tilde), gradient = uoro_grad(skey,
                                                                  theta,
                                                                  state,
                                                                  s_tilde,
                                                                  theta_tilde)
    elif args.estimate == 'rtrl':
        (loss, state, dstate_dtheta), gradient = rtrl_grad(theta, state, dstate_dtheta)

    lr = args.lr
    theta = theta - lr * gradient
    long_unroll_loss, _ = unroll(theta, jnp.ones(n), 500)
    print('Iter: {} | lr: {:6.4e} | Loss: {:6.4e} | Long unroll loss: {:6.4e} | Theta: {} | Grad: {}'.format(
           i, lr, loss, long_unroll_loss, theta, gradient))

    iteration_logger.writerow({ 'iteration': i,
                                'loss': loss,
                                'long_unroll_loss': long_unroll_loss,
                                'theta': theta,
                                'gradient': gradient
                              })
Example No. 6
        all_eval_rewards = []
        for eval_rollout in range(50):
            fresh_state = fresh_env.reset()
            total_reward, _, _, _, _ = unroll(theta,
                                              fresh_state,
                                              fresh_env,
                                              0,
                                              args.horizon,
                                              args.horizon,
                                              training=False,
                                              shift=0.0)
            all_eval_rewards.append(total_reward)

        all_eval_rewards = onp.array(all_eval_rewards)
        print('time: {} | i: {} | steps: {} | reward: {:6.4f}'.format(
            elapsed_time, iteration, total_count, onp.mean(all_eval_rewards)))
        sys.stdout.flush()
        # --------------------------------------------------------

        iteration_logger.writerow({
            'time': elapsed_time,
            'iteration': iteration,
            'total_steps': total_count,
            'reward_mean': onp.mean(all_eval_rewards),
        })
        start_time = time.time()

    theta_grad = estimator.compute_gradient(theta)
    theta, outer_optim_params = outer_optim_step(theta, -theta_grad,
                                                 outer_optim_params)
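
outer_optim_step itself is not shown in the excerpt (note that the caller passes -theta_grad). A minimal sketch consistent with the three-argument call above, assuming SGD with momentum and a dict-based optimizer state; this is a hypothetical stand-in, not the original helper:

def outer_optim_step(theta, grad, optim_params, lr=1e-2, momentum=0.9):
    # Momentum step in the direction of -grad; because the caller negates
    # theta_grad, theta moves along +theta_grad.
    velocity = momentum * optim_params.get('velocity', 0.0) - lr * grad
    optim_params['velocity'] = velocity
    return theta + velocity, optim_params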