Example #1
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)

    estimator = TabularDualDice(dataset.spec, gamma)
    estimate = estimator.solve(dataset, target_policy)
    print('estimated per step avg', estimate)
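The listings in this section read their configuration from a module-level FLAGS object in the usual absl pattern (suggested by the main(argv) signature); the flag definitions themselves are omitted. The sketch below is a plausible, purely illustrative set of declarations for Example #1: the flag names come from the code above, but the defaults, help strings, and the '/tmp/dice_data' path are assumptions.
from absl import app
from absl import flags

FLAGS = flags.FLAGS

# Illustrative declarations only; names match Example #1, defaults are assumed.
flags.DEFINE_string('env_name', 'grid', 'Environment name.')
flags.DEFINE_integer('seed', 0, 'Random seed used when generating the dataset.')
flags.DEFINE_bool('tabular_obs', True, 'Whether observations are tabular.')
flags.DEFINE_integer('num_trajectory', 100, 'Number of logged trajectories.')
flags.DEFINE_integer('max_trajectory_length', 100, 'Max steps per trajectory.')
flags.DEFINE_float('alpha', 0.0, 'Behavior-policy mixture parameter.')
flags.DEFINE_string('load_dir', '/tmp/dice_data', 'Directory with saved datasets.')
flags.DEFINE_float('gamma', 0.99, 'Discount factor; must satisfy 0 <= gamma < 1.')


if __name__ == '__main__':
    app.run(main)  # `main` as defined in the example above.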
Example #2
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    step_encoding = FLAGS.step_encoding
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    max_trajectory_length_train = FLAGS.max_trajectory_length_train or max_trajectory_length

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_base = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                   'numtraj{NUM_TRAJ}').format(ENV_NAME=env_name,
                                               TAB=tabular_obs,
                                               ALPHA=alpha,
                                               SEED=seed,
                                               NUM_TRAJ=num_trajectory)
    hparam_data = hparam_base + '_maxtraj{MAX_TRAJ}'.format(
        MAX_TRAJ=max_trajectory_length_train)
    hparam_out = hparam_base + '_maxtraj{MAX_TRAJ}'.format(
        MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_data)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)

    estimator = TabularTeQDice(dataset.spec, gamma, max_trajectory_length,
                               step_encoding)
    estimate = estimator.solve(dataset, target_policy)
    print('estimated per step avg', estimate)

    print('Done!')

    if save_dir is not None:
        if not tf.io.gfile.isdir(save_dir):
            tf.io.gfile.makedirs(save_dir)
        out_fname = os.path.join(
            save_dir, hparam_out + '_enc{ENC}.npy'.format(ENC=step_encoding))
        print('Saving results to', out_fname)
        with tf.io.gfile.GFile(out_fname, 'w') as f:
            np.save(f, estimate.numpy())
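As a small follow-up sketch, the result written above can be read back with the same GFile API; this assumes the tf/np imports the script already uses, and out_fname is the path constructed above.
# Read the saved estimate back; `out_fname` is the path written above.
with tf.io.gfile.GFile(out_fname, 'rb') as f:
    loaded_estimate = np.load(f)
print('loaded estimate', loaded_estimate)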
Example #3
def main(argv):
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    alpha_target = FLAGS.alpha_target
    gamma = FLAGS.gamma
    nu_learning_rate = FLAGS.nu_learning_rate
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_regularizer = FLAGS.nu_regularizer
    zeta_regularizer = FLAGS.zeta_regularizer
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    f_exponent = FLAGS.f_exponent
    primal_form = FLAGS.primal_form

    primal_regularizer = FLAGS.primal_regularizer
    dual_regularizer = FLAGS.dual_regularizer
    kl_regularizer = FLAGS.kl_regularizer
    zero_reward = FLAGS.zero_reward
    norm_regularizer = FLAGS.norm_regularizer
    zeta_pos = FLAGS.zeta_pos

    scale_reward = FLAGS.scale_reward
    shift_reward = FLAGS.shift_reward
    transform_reward = FLAGS.transform_reward

    eps_std = FLAGS.eps_std

    def reward_fn(env_step):
        reward = env_step.reward * scale_reward + shift_reward
        if transform_reward is None:
            return reward
        if transform_reward == 'exp':
            reward = tf.math.exp(reward)
        elif transform_reward == 'cuberoot':
            reward = tf.sign(reward) * tf.math.pow(tf.abs(reward), 1.0 / 3.0)
        else:
            raise ValueError(
                'Reward {} not implemented.'.format(transform_reward))
        return reward

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    train_hparam_str = (
        'nlr{NLR}_zlr{ZLR}_zeror{ZEROR}_preg{PREG}_dreg{DREG}_kreg{KREG}_nreg{NREG}_'
        'pform{PFORM}_fexp{FEXP}_zpos{ZPOS}_'
        'scaler{SCALER}_shiftr{SHIFTR}_transr{TRANSR}').format(
            NLR=nu_learning_rate,
            ZLR=zeta_learning_rate,
            ZEROR=zero_reward,
            PREG=primal_regularizer,
            DREG=dual_regularizer,
            KREG=kl_regularizer,
            NREG=norm_regularizer,
            PFORM=primal_form,
            FEXP=f_exponent,
            ZPOS=zeta_pos,
            SCALER=scale_reward,
            SHIFTR=shift_reward,
            TRANSR=transform_reward,
        )

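    # Note: the detailed hparam string built above is overwritten here, so only
    # eps_std and kl_regularizer end up in the output directory name.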
    train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std,
                                                  KL=kl_regularizer)

    if save_dir is not None:
        target_hparam_str = hparam_str.replace(
            'alpha{}'.format(alpha),
            'alpha{}_alphat{}'.format(alpha, alpha_target))
        save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
        summary_writer.set_as_default()
    else:
        summary_writer = tf.summary.create_noop_writer()
        summary_writer.set_as_default()

    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset from', directory)
    dataset = Dataset.load(directory)
    #dataset = Dataset.load(directory.replace('alpha{}'.format(alpha), 'alpha0.0'))

    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)
    print('behavior per-step',
          estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    input_spec = (dataset.spec.observation, dataset.spec.action)
    nu_network = ValueNetwork(input_spec,
                              output_dim=2,
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=kernel_initializer)
    output_activation_fn = tf.math.square if zeta_pos else tf.identity
    zeta_network = ValueNetwork(input_spec,
                                output_dim=2,
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                output_activation_fn=output_activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=kernel_initializer)

    nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
    zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)
    lam_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)

    estimator = NeuralBayesDice(dataset.spec,
                                nu_network,
                                zeta_network,
                                nu_optimizer,
                                zeta_optimizer,
                                lam_optimizer,
                                gamma,
                                zero_reward=zero_reward,
                                f_exponent=f_exponent,
                                primal_form=primal_form,
                                reward_fn=reward_fn,
                                primal_regularizer=primal_regularizer,
                                dual_regularizer=dual_regularizer,
                                kl_regularizer=kl_regularizer,
                                eps_std=eps_std,
                                norm_regularizer=norm_regularizer,
                                nu_regularizer=nu_regularizer,
                                zeta_regularizer=zeta_regularizer)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)

    target_policy = get_target_policy(load_dir, env_name, tabular_obs,
                                      alpha_target)
    running_losses = []
    all_dual = []
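    # Each iteration samples consecutive two-step transitions plus a batch of
    # episode-initial steps (truncated at t=0) to feed the estimator.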
    for step in range(num_steps):
        transitions_batch = dataset.get_step(batch_size, num_steps=2)
        initial_steps_batch, _ = dataset.get_episode(batch_size,
                                                     truncate_episode_at=1)
        initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                    initial_steps_batch)
        losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                      target_policy)
        running_losses.append(losses)
        if step % 500 == 0 or step == num_steps - 1:
            num_samples = 100
            dual_ests = []
            for i in range(num_samples):
                dual_est = estimator.estimate_average_reward(
                    dataset, target_policy, write_summary=(i == 0))
                dual_ests.append(dual_est)
            tf.summary.scalar('dual/mean', tf.math.reduce_mean(dual_ests))
            tf.summary.scalar('dual/std', tf.math.reduce_std(dual_ests))

            tf.print('dual/mean =', tf.math.reduce_mean(dual_ests),
                     'dual/std =', tf.math.reduce_std(dual_ests))

            all_dual.append(dual_ests)
            running_losses = []
        global_step.assign_add(1)

    if save_dir is not None:
        with tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'w') as f:
            np.save(f, all_dual)

    print('Done!')
Example #4
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    num_steps = FLAGS.num_steps
    divergence_limit = FLAGS.divergence_limit
    algae_alpha = FLAGS.algae_alpha
    alpha_learning_rate = FLAGS.alpha_learning_rate
    train_nu_zeta_per_steps = FLAGS.train_nu_zeta_per_steps
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    train_hparam_str = ('alr{A_LR}_tnzs{TNZS}_limit{LIMIT}_'
                        'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                            A_LR=alpha_learning_rate,
                            TNZS=train_nu_zeta_per_steps,
                            LIMIT=limit_episodes,
                            GAMMA=gamma,
                            ALGAE_ALPHA=algae_alpha,
                            DIV=divergence_limit)

    if save_dir is not None:
        save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    alpha_optimizer = tf.keras.optimizers.Adam(alpha_learning_rate,
                                               beta_1=0.0,
                                               beta_2=0.0)

    episodes, valid_steps = dataset.get_all_episodes(limit=limit_episodes)
    num_samples = tf.reduce_sum(
        tf.cast(
            tf.logical_and(valid_steps, episodes.discount > 0)[:, :-1],
            tf.float32))
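    # num_samples counts valid, non-terminal transitions; the divergence limit
    # passed to the estimator below is scaled down by this count.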
    estimator = TabularRobustDice(
        dataset_spec=dataset.spec,
        alpha_optimizer=alpha_optimizer,
        gamma=gamma,
        divergence_limit=divergence_limit / num_samples,
        algae_alpha=algae_alpha * np.array([1, 1]),
        limit_episodes=limit_episodes)
    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)

    def one_step(transitions_batch, initial_steps_batch, target_policy):
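        # Alpha training is disabled below; this only advances the step counter.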
        global_step.assign_add(1)
        #initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
        #                                            initial_steps_batch)
        #losses, _ = estimator.train_alpha(initial_steps_batch, transitions_batch,
        #                                  target_policy)
        #return losses

    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            if step % train_nu_zeta_per_steps == 0:
                # first solve for the primal nu_loss,
                print('Step: {}. Solve for an updated tabular nu/zeta.'.format(
                    step))
                loss = estimator.solve_nu_zeta(dataset, target_policy)
                running_losses.append(loss)
            one_step(None, None, None)

            if step % 500 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = np.mean(running_losses, 0)[0]
                for idx, est in enumerate(estimate):
                    tf.summary.scalar('estimate%d' % idx, est)
                running_estimates.append(estimate)
                print('estimated per step avg %s' % estimate)
                print('avg last 3 estimated per step avg %s' %
                      np.mean(running_estimates[-3:], axis=0))
                running_losses = []

    if save_dir is not None:
        results_filename = os.path.join(save_dir, 'results.npy')
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, running_estimates)
    print('Done!')
Example #5
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  policy_learning_rate = FLAGS.policy_learning_rate
  q_learning_rate = FLAGS.q_learning_rate
  batch_size = FLAGS.batch_size
  mode = FLAGS.mode
  ci_method = FLAGS.ci_method
  delta = FLAGS.delta
  delta_tail = 1 - delta
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  use_trained_policy = FLAGS.use_trained_policy
  use_doubly_robust = FLAGS.use_doubly_robust
  assert 0 <= gamma < 1.

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)

  if FLAGS.num_trajectory_data is not None:
    hparam_str_data = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                       'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                           ENV_NAME=env_name,
                           TAB=tabular_obs,
                           ALPHA=alpha,
                           SEED=seed,
                           NUM_TRAJ=FLAGS.num_trajectory_data,
                           MAX_TRAJ=max_trajectory_length)
  else:
    hparam_str_data = hparam_str

  directory = os.path.join(load_dir, hparam_str_data)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = (
      'plr{P_LR}_tp{TRAINED_P}_batch{BATCH_SIZE}_mode{MODE}_CI{CI_METHOD}_UTP{USE_TRAINED_POLICY}_gam{GAMMA}_del{DELTA}'
  ).format(
      P_LR=policy_learning_rate,
      TRAINED_P=use_trained_policy,
      BATCH_SIZE=batch_size,
      MODE=mode,
      CI_METHOD=ci_method,
      USE_TRAINED_POLICY=use_trained_policy,
      GAMMA=gamma,
      DELTA=delta)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

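  # Shift rewards to be non-negative for the importance-sampling confidence
  # interval, and undo the shift when reporting estimates.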
  def non_negative_reward_translation(env_step):
    return env_step.reward - min_reward

  def inv_non_negative_estimate_translation(estimate):
    return estimate + min_reward

  if use_trained_policy:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    policy_optimizer = tf.keras.optimizers.Adam(
        policy_learning_rate, beta_1=0.0, beta_2=0.0)
    policy_network = PolicyNetwork(
        dataset.spec.observation,
        dataset.spec.action,
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    policy_optimizer = None
    policy_network = None

  if use_doubly_robust:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    q_optimizer = tf.keras.optimizers.Adam(q_learning_rate)
    q_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    q_optimizer = None
    q_network = None

  estimator = ImportanceSamplingCI(
      dataset_spec=dataset.spec,
      policy_optimizer=policy_optimizer,
      policy_network=policy_network,
      mode=mode,
      ci_method=ci_method,
      delta_tail=delta_tail,
      gamma=gamma,
      reward_fn=non_negative_reward_translation,
      q_network=q_network,
      q_optimizer=q_optimizer)
  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  # Following is for policy learning + IS confidence interval
  @tf.function
  def one_step(data_batch):
    global_step.assign_add(1)
    loss = estimator.train_step(data_batch, target_policy)
    return loss

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    running_estimate_cis = []
    for step in range(num_steps):
      data_batch = dataset.get_step(batch_size, num_steps=2)
      loss = one_step(data_batch)
      running_losses.append(loss)

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'loss', np.mean(running_losses, 0))
        running_losses = []
        estimate = estimator.estimate_average_reward(
            dataset, target_policy, episode_limit=num_trajectory)
        estimate = inv_non_negative_estimate_translation(estimate)
        running_estimates.append(estimate)
        print('estimated per step avg %s' % estimate)
        print('avg last 3 estimated per step avg %s' %
              np.mean(running_estimates[-3:], axis=0))

        estimate_ci = estimator.estimate_reward_ci(dataset, target_policy)
        estimate_ci = np.array(
            [inv_non_negative_estimate_translation(ele) for ele in estimate_ci])
        running_estimate_cis.append(estimate_ci)
        print('estimated CI per step avg %s' % estimate_ci)
        print('avg last 3 estimated CI per step avg %s' %
              np.mean(running_estimate_cis[-3:], axis=0))

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimate_cis)
  print('Done!')
Example #6
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  weight_learning_rate = FLAGS.weight_learning_rate
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  batch_size = FLAGS.batch_size
  num_steps = FLAGS.num_steps

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = ('nlr{NU_LR}_zlr{Z_LR}_batch{BATCH_SIZE}_'
                      'gam{GAMMA}_nreg{NU_REG}_zreg{Z_REG}_algae{ALGAE_ALPHA}_'
                      'prim{PRIMAL}_div{DIV}').format(
                          NU_LR=nu_learning_rate,
                          Z_LR=zeta_learning_rate,
                          BATCH_SIZE=batch_size,
                          GAMMA=gamma,
                          NU_REG=nu_regularizer,
                          Z_REG=zeta_regularizer,
                          ALGAE_ALPHA=algae_alpha,
                          PRIMAL=primal_form,
                          DIV=divergence_limit)
  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.TruncatedNormal(
      stddev=0.5, seed=1)
  hidden_dims = (64,)
  n_intervals = 1
  nu_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                            fc_layer_params=hidden_dims,
                            activation_fn=activation_fn,
                            kernel_initializer=kernel_initializer,
                            last_kernel_initializer=None,
                            output_dim=2 * 2 * n_intervals)
  zeta_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=None,
                              output_dim=2 * 2 * n_intervals)
  weight_network = ValueNetwork((dataset.spec.observation,  # initial state
                                 dataset.spec.observation,  # cur state
                                 dataset.spec.action,       # cur action
                                 dataset.spec.observation), # next state
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=None,
                                output_dim=2 * n_intervals)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate, beta_2=0.99)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate, beta_2=0.99)
  weight_optimizer = tf.keras.optimizers.Adam(weight_learning_rate, beta_2=0.99)

  estimator = NeuralCoinDice(dataset.spec,
                             nu_network, zeta_network,
                             weight_network,
                             nu_optimizer, zeta_optimizer,
                             weight_optimizer,
                             gamma=gamma,
                             divergence_limit=divergence_limit,
                             f_exponent=f_exponent,
                             primal_form=primal_form,
                             nu_regularizer=nu_regularizer,
                             zeta_regularizer=zeta_regularizer,
                             algae_alpha=algae_alpha * np.array([1, 1]),
                             unbias_algae_alpha=False,
                             closed_form_weights=True,
                             num_samples=None)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  @tf.function
  def one_step(transitions_batch, initial_steps_batch):
    global_step.assign_add(1)
    with tf.summary.record_if(tf.math.mod(global_step, 25) == 0):
      initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                  initial_steps_batch)
      losses, _ = estimator.train_step(initial_steps_batch, transitions_batch,
                                       target_policy)
    return losses

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):

      transitions_batch = dataset.get_step(batch_size, num_steps=2)
      initial_steps_batch, _ = dataset.get_episode(
          batch_size, truncate_episode_at=1)
      losses = one_step(transitions_batch, initial_steps_batch)
      running_losses.append([t.numpy() for t in losses])

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated confidence interval %s' % estimate)
        print('avg last 3 estimated confidence interval %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimates)
  print('Done!')
Example #7
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    nstep_returns = FLAGS.nstep_returns
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)
    dataset = PerturbedDataset(dataset,
                               num_perturbations=10,
                               perturbation_scale=1.)
    #estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    #print('perturbed data per step avg', estimate)

    value_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=(64, 64),
        output_dim=10)
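    # Presumably one output head per perturbed dataset copy (both are 10 here).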
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    estimator = NeuralQLearning(dataset.spec,
                                value_network,
                                optimizer,
                                gamma,
                                num_qvalues=10)
    for step in range(num_steps):
        batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
        loss, _ = estimator.train_step(batch, target_policy)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', loss)
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            print('estimated per step avg', estimate)

    print('Done!')
Example #8
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    num_trajectory_train = FLAGS.num_trajectory_train
    if num_trajectory_train is None:
        num_trajectory_train = num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    max_trajectory_length_train = FLAGS.max_trajectory_length_train
    if max_trajectory_length_train is None:
        max_trajectory_length_train = max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    nu_learning_rate = FLAGS.nu_learning_rate
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_regularizer = FLAGS.nu_regularizer
    zeta_regularizer = FLAGS.zeta_regularizer
    f_exponent = FLAGS.f_exponent
    primal_form = FLAGS.primal_form
    batch_size = FLAGS.batch_size
    num_steps = FLAGS.num_steps
    save_dir = FLAGS.save_dir
    network_dir = os.path.join(save_dir, 'networks') if save_dir else None
    estimate_dir = os.path.join(save_dir, 'estimates') if save_dir else None

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_base = '{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}'.format(
        ENV_NAME=env_name, TAB=tabular_obs, ALPHA=alpha, SEED=seed)

    hparam_data = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory if num_steps == 0 else num_trajectory_train,
        MAX_TRAJ=max_trajectory_length
        if num_steps == 0 else max_trajectory_length_train)
    hparam_net = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory_train, MAX_TRAJ=max_trajectory_length_train)
    hparam_result = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory, MAX_TRAJ=max_trajectory_length)

    if estimate_dir is not None:
        if not tf.io.gfile.isdir(estimate_dir):
            tf.io.gfile.makedirs(estimate_dir)
        log_file = os.path.join(estimate_dir, hparam_result + '.log')
        print("Logging to '{0}'".format(log_file))
        sys.stdout = Logger(log_file)

    directory = os.path.join(load_dir, hparam_data)
    print('Loading dataset from', directory)
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    activation_fn = tf.nn.tanh
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, )
    step_encoding = None
    #step_encoding = 'one_hot'
    nu_network = StepValueNetwork(
        (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer,
        max_trajectory_length_train=max_trajectory_length_train,
        step_encoding=step_encoding)
    zeta_network = StepValueNetwork(
        (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer,
        max_trajectory_length_train=max_trajectory_length_train,
        step_encoding=step_encoding)
    nu_network.create_variables()
    zeta_network.create_variables()
    try:
        nu_network.load_weights(os.path.join(network_dir, hparam_net, 'nu'))
        zeta_network.load_weights(os.path.join(network_dir, hparam_net,
                                               'zeta'))
        print('loaded networks from', network_dir)
    except Exception:
        print('initialized network from scratch')

    nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
    zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)

    estimator = NeuralTeQDice(dataset.spec,
                              nu_network,
                              zeta_network,
                              nu_optimizer,
                              zeta_optimizer,
                              gamma,
                              f_exponent=f_exponent,
                              primal_form=primal_form,
                              nu_regularizer=nu_regularizer,
                              zeta_regularizer=zeta_regularizer)

    running_losses = []
    running_estimates = []
    for step in range(num_steps):
        transitions_batch = dataset.get_step(batch_size, num_steps=2)
        initial_steps_batch, _ = dataset.get_episode(batch_size,
                                                     truncate_episode_at=1)
        initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                    initial_steps_batch)
        losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                      target_policy)
        running_losses.append(losses)
        if step % 500 == 0 or step == num_steps - 1:
            print('step', step, 'losses', np.mean(running_losses, 0))
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            running_estimates.append(estimate)
            print('estimated per step avg %f' % estimate)
            print('avg last 3 estimated per step avg %f' %
                  np.mean(running_estimates[-3:]))
            if network_dir is not None:
                nu_network.save_weights(
                    os.path.join(network_dir, hparam_net, 'nu'))
                zeta_network.save_weights(
                    os.path.join(network_dir, hparam_net, 'zeta'))
                print('saved network weights to',
                      os.path.join(network_dir, hparam_net))
            running_losses = []

    if num_steps == 0:
        estimate = estimator.estimate_average_reward(dataset, target_policy)
        running_estimates.append(estimate)
        print('eval only per step avg %f' % np.mean(running_estimates[-3:]))

    if estimate_dir is not None:
        out_fname = os.path.join(estimate_dir, hparam_result + '.npy')
        print('Saving estimation results to', out_fname)
        with tf.io.gfile.GFile(out_fname, 'w') as f:
            np.save(f, running_estimates)

    print('Done!')
Example #9
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    num_steps = FLAGS.num_steps
    divergence_limit = FLAGS.divergence_limit
    algae_alpha = FLAGS.algae_alpha
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    train_hparam_str = ('limit{LIMIT}_'
                        'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                            LIMIT=limit_episodes,
                            GAMMA=gamma,
                            ALGAE_ALPHA=algae_alpha,
                            DIV=divergence_limit)

    if save_dir is not None:
        save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    estimator = TabularCoinDice(dataset_spec=dataset.spec,
                                gamma=gamma,
                                divergence_limit=divergence_limit,
                                algae_alpha=algae_alpha * np.array([1, 1]),
                                limit_episodes=limit_episodes)
    estimator.prepare_dataset(dataset, target_policy)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)
    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            loss = estimator.train_step(dataset, target_policy)
            running_losses.append(loss)
            global_step.assign_add(1)

            if step % 10 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = np.mean(running_losses, 0)[0]
                for idx, est in enumerate(estimate):
                    tf.summary.scalar('estimate%d' % idx, est)
                running_estimates.append(estimate)
                print('estimated confidence interval %s' % estimate)
                print('avg last 3 estimated confidence interval %s' %
                      np.mean(running_estimates[-3:], axis=0))
                running_losses = []

    if save_dir is not None:
        results_filename = os.path.join(save_dir, 'results.npy')
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, running_estimates)
    print('Done!')
Example #10
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    alpha = FLAGS.alpha
    alpha_target = FLAGS.alpha_target

    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_learning_rate = FLAGS.nu_learning_rate
    solve_for_state_action_ratio = FLAGS.solve_for_state_action_ratio
    eps_std = FLAGS.eps_std
    kl_regularizer = FLAGS.kl_regularizer

    target_policy = get_target_policy(load_dir,
                                      env_name,
                                      tabular_obs,
                                      alpha=alpha_target)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)

    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('behavior per-step',
          estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

    train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std,
                                                  KL=kl_regularizer)

    if save_dir is not None:
        # Save for a specific alpha target
        target_hparam_str = hparam_str.replace(
            'alpha{}'.format(alpha),
            'alpha{}_alphat{}'.format(alpha, alpha_target))
        save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    estimator = TabularBayesDice(
        dataset_spec=dataset.spec,
        gamma=gamma,
        solve_for_state_action_ratio=solve_for_state_action_ratio,
        zeta_learning_rate=zeta_learning_rate,
        nu_learning_rate=nu_learning_rate,
        kl_regularizer=kl_regularizer,
        eps_std=eps_std,
    )
    estimator.prepare_dataset(dataset, target_policy)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)
    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            loss = estimator.train_step()[0]
            running_losses.append(loss)
            global_step.assign_add(1)

            if step % 500 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = estimator.estimate_average_reward(
                    dataset, target_policy)
                tf.debugging.check_numerics(estimate, 'NaN in estimate')
                running_estimates.append(estimate)
                tf.print('est', tf.math.reduce_mean(estimate),
                         tf.math.reduce_std(estimate))

                running_losses = []

    if save_dir is not None:
        with tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'),
                               'w') as f:
            np.save(f, running_estimates)
        print('saved results to %s' % save_dir)

    print('Done!')
Example #11
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    train_hparam_str = ('gamma{GAM}_limit{LIMIT}').format(GAM=gamma,
                                                          LIMIT=limit_episodes)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    estimator = TabularQLearning(
        dataset.spec,
        gamma,
        num_qvalues=200,
        perturbation_scale=[0.0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 1.],
        default_reward_value=0.0,
        limit_episodes=limit_episodes)
    estimate = estimator.solve(dataset, target_policy)
    print('estimated per step avg', estimate)

    if save_dir is not None:
        results_dir = os.path.join(save_dir, hparam_str)
        if not tf.io.gfile.exists(results_dir):
            tf.io.gfile.makedirs(results_dir)
        results_filename = os.path.join(results_dir,
                                        'results_%s.npy' % train_hparam_str)
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, estimate)

    print('Done!')