Example #1
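Example #1 appears to be the training entry point for a NeuralBayesDice estimator: it loads an offline dataset, builds nu/zeta value networks, and alternates gradient steps with periodic logging of the dual estimate. The snippet omits its module header; the sketch below shows the scaffolding it assumes (Abseil flags plus TF2/NumPy). The flag defaults and the import comment are placeholders, not the original definitions.

import os

from absl import app
from absl import flags
import numpy as np
import tensorflow.compat.v2 as tf

# Project-specific symbols used below (Dataset, ValueNetwork, NeuralBayesDice,
# estimator_lib, get_target_policy, ...) are assumed to come from the
# surrounding package; their exact module paths are not shown in the snippet.

FLAGS = flags.FLAGS
flags.DEFINE_string('load_dir', None, 'Directory to load the dataset from.')
flags.DEFINE_string('save_dir', None, 'Directory to write summaries and results to.')
flags.DEFINE_string('env_name', 'grid', 'Environment name (placeholder default).')
flags.DEFINE_integer('seed', 0, 'Seed used when the dataset was generated.')
flags.DEFINE_float('gamma', 0.99, 'Discount factor (placeholder default).')
# ...the remaining FLAGS.* fields read in main() are defined analogously.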
def main(argv):
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    alpha_target = FLAGS.alpha_target
    gamma = FLAGS.gamma
    nu_learning_rate = FLAGS.nu_learning_rate
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_regularizer = FLAGS.nu_regularizer
    zeta_regularizer = FLAGS.zeta_regularizer
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    f_exponent = FLAGS.f_exponent
    primal_form = FLAGS.primal_form

    primal_regularizer = FLAGS.primal_regularizer
    dual_regularizer = FLAGS.dual_regularizer
    kl_regularizer = FLAGS.kl_regularizer
    zero_reward = FLAGS.zero_reward
    norm_regularizer = FLAGS.norm_regularizer
    zeta_pos = FLAGS.zeta_pos

    scale_reward = FLAGS.scale_reward
    shift_reward = FLAGS.shift_reward
    transform_reward = FLAGS.transform_reward

    eps_std = FLAGS.eps_std

    def reward_fn(env_step):
        reward = env_step.reward * scale_reward + shift_reward
        if transform_reward is None:
            return reward
        if transform_reward == 'exp':
            reward = tf.math.exp(reward)
        elif transform_reward == 'cuberoot':
            reward = tf.sign(reward) * tf.math.pow(tf.abs(reward), 1.0 / 3.0)
        else:
            raise ValueError(
                'Reward {} not implemented.'.format(transform_reward))
        return reward

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    train_hparam_str = (
        'nlr{NLR}_zlr{ZLR}_zeror{ZEROR}_preg{PREG}_dreg{DREG}_kreg{KREG}_nreg{NREG}_'
        'pform{PFORM}_fexp{FEXP}_zpos{ZPOS}_'
        'scaler{SCALER}_shiftr{SHIFTR}_transr{TRANSR}').format(
            NLR=nu_learning_rate,
            ZLR=zeta_learning_rate,
            ZEROR=zero_reward,
            PREG=primal_regularizer,
            DREG=dual_regularizer,
            KREG=kl_regularizer,
            NREG=norm_regularizer,
            PFORM=primal_form,
            FEXP=f_exponent,
            ZPOS=zeta_pos,
            SCALER=scale_reward,
            SHIFTR=shift_reward,
            TRANSR=transform_reward,
        )

    train_hparam_str += '_eps{EPS}_kl{KL}'.format(EPS=eps_std, KL=kl_regularizer)

    if save_dir is not None:
        target_hparam_str = hparam_str.replace(
            'alpha{}'.format(alpha),
            'alpha{}_alphat{}'.format(alpha, alpha_target))
        save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
        summary_writer.set_as_default()
    else:
        tf.summary.create_noop_writer()

    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset from', directory)
    dataset = Dataset.load(directory)
    #dataset = Dataset.load(directory.replace('alpha{}'.format(alpha), 'alpha0.0'))

    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)
    print('behavior per-step',
          estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    input_spec = (dataset.spec.observation, dataset.spec.action)
    nu_network = ValueNetwork(input_spec,
                              output_dim=2,
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=kernel_initializer)
    output_activation_fn = tf.math.square if zeta_pos else tf.identity
    zeta_network = ValueNetwork(input_spec,
                                output_dim=2,
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                output_activation_fn=output_activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=kernel_initializer)

    nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
    zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)
    lam_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)

    estimator = NeuralBayesDice(dataset.spec,
                                nu_network,
                                zeta_network,
                                nu_optimizer,
                                zeta_optimizer,
                                lam_optimizer,
                                gamma,
                                zero_reward=zero_reward,
                                f_exponent=f_exponent,
                                primal_form=primal_form,
                                reward_fn=reward_fn,
                                primal_regularizer=primal_regularizer,
                                dual_regularizer=dual_regularizer,
                                kl_regularizer=kl_regularizer,
                                eps_std=FLAGS.eps_std,
                                norm_regularizer=norm_regularizer,
                                nu_regularizer=nu_regularizer,
                                zeta_regularizer=zeta_regularizer)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)

    target_policy = get_target_policy(load_dir, env_name, tabular_obs,
                                      alpha_target)
    running_losses = []
    all_dual = []
    for step in range(num_steps):
        transitions_batch = dataset.get_step(batch_size, num_steps=2)
        initial_steps_batch, _ = dataset.get_episode(batch_size,
                                                     truncate_episode_at=1)
        # Keep only the first step of each sampled episode (the initial state).
        initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                    initial_steps_batch)
        losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                      target_policy)
        running_losses.append(losses)
        if step % 500 == 0 or step == num_steps - 1:
            num_samples = 100
            dual_ests = []
            for i in range(num_samples):
                dual_est = estimator.estimate_average_reward(
                    dataset, target_policy, write_summary=(i == 0))
                dual_ests.append(dual_est)
            tf.summary.scalar('dual/mean', tf.math.reduce_mean(dual_ests))
            tf.summary.scalar('dual/std', tf.math.reduce_std(dual_ests))

            tf.print('dual/mean =', tf.math.reduce_mean(dual_ests),
                     'dual/std =', tf.math.reduce_std(dual_ests))

            all_dual.append(dual_ests)
            running_losses = []
        global_step.assign_add(1)

    if save_dir is not None:
        np.save(tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'wb'),
                all_dual)

    print('Done!')
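All five examples read like Abseil-style script entry points. Assuming that structure, each file would end with the usual boilerplate, shown once here rather than repeated per example:

if __name__ == '__main__':
  app.run(main)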
Example #2
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  policy_learning_rate = FLAGS.policy_learning_rate
  q_learning_rate = FLAGS.q_learning_rate
  batch_size = FLAGS.batch_size
  mode = FLAGS.mode
  ci_method = FLAGS.ci_method
  delta = FLAGS.delta
  delta_tail = 1 - delta
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  use_trained_policy = FLAGS.use_trained_policy
  use_doubly_robust = FLAGS.use_doubly_robust
  assert 0 <= gamma < 1.

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)

  if FLAGS.num_trajectory_data is not None:
    hparam_str_data = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                       'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                           ENV_NAME=env_name,
                           TAB=tabular_obs,
                           ALPHA=alpha,
                           SEED=seed,
                           NUM_TRAJ=FLAGS.num_trajectory_data,
                           MAX_TRAJ=max_trajectory_length)
  else:
    hparam_str_data = hparam_str

  directory = os.path.join(load_dir, hparam_str_data)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = (
      'plr{P_LR}_tp{TRAINED_P}_batch{BATCH_SIZE}_mode{MODE}_CI{CI_METHOD}_UTP{USE_TRAINED_POLICY}_gam{GAMMA}_del{DELTA}'
  ).format(
      P_LR=policy_learning_rate,
      TRAINED_P=use_trained_policy,
      BATCH_SIZE=batch_size,
      MODE=mode,
      CI_METHOD=ci_method,
      USE_TRAINED_POLICY=use_trained_policy,
      GAMMA=gamma,
      DELTA=delta)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  def non_negative_reward_translation(env_step):
    return env_step.reward - min_reward

  def inv_non_negative_estimate_translation(estimate):
    return estimate + min_reward

  if use_trained_policy:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    policy_optimizer = tf.keras.optimizers.Adam(
        policy_learning_rate, beta_1=0.0, beta_2=0.0)
    policy_network = PolicyNetwork(
        dataset.spec.observation,
        dataset.spec.action,
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    policy_optimizer = None
    policy_network = None

  if use_doubly_robust:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    q_optimizer = tf.keras.optimizers.Adam(q_learning_rate)
    q_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    q_optimizer = None
    q_network = None

  estimator = ImportanceSamplingCI(
      dataset_spec=dataset.spec,
      policy_optimizer=policy_optimizer,
      policy_network=policy_network,
      mode=mode,
      ci_method=ci_method,
      delta_tail=delta_tail,
      gamma=gamma,
      reward_fn=non_negative_reward_translation,
      q_network=q_network,
      q_optimizer=q_optimizer)
  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  # The following is for policy learning plus the IS confidence interval.
  @tf.function
  def one_step(data_batch):
    global_step.assign_add(1)
    loss = estimator.train_step(data_batch, target_policy)
    return loss

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    running_estimate_cis = []
    for step in range(num_steps):
      data_batch = dataset.get_step(batch_size, num_steps=2)
      loss = one_step(data_batch)
      running_losses.append(loss)

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'loss', np.mean(running_losses, 0))
        running_losses = []
        estimate = estimator.estimate_average_reward(
            dataset, target_policy, episode_limit=num_trajectory)
        estimate = inv_non_negative_estimate_translation(estimate)
        running_estimates.append(estimate)
        print('estimated per step avg %s' % estimate)
        print('avg last 3 estimated per step avg %s' %
              np.mean(running_estimates[-3:], axis=0))

        estimate_ci = estimator.estimate_reward_ci(dataset, target_policy)
        estimate_ci = np.array(
            [inv_non_negative_estimate_translation(ele) for ele in estimate_ci])
        running_estimate_cis.append(estimate_ci)
        print('estimated CI per step avg %s' % estimate_ci)
        print('avg last 3 estimated CI per step avg %s' %
              np.mean(running_estimate_cis[-3:], axis=0))

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'wb') as f:
      np.save(f, running_estimate_cis)
  print('Done!')
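Example #2 trains an ImportanceSamplingCI estimator (optionally with a learned policy network and a doubly-robust Q-network) and periodically reports both point estimates and confidence intervals. Before estimation it shifts rewards so they are non-negative, then shifts the resulting estimates back. A tiny check with hypothetical numbers, just to make the direction of the two translations explicit:

import numpy as np

min_reward = -2.0                      # hypothetical dataset minimum
rewards = np.array([-2.0, 0.5, 3.0])
shifted = rewards - min_reward         # [0.0, 2.5, 5.0]: all non-negative
recovered = shifted + min_reward       # back on the original reward scale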
Example #3
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    nstep_returns = FLAGS.nstep_returns
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)
    dataset = PerturbedDataset(dataset,
                               num_perturbations=10,
                               perturbation_scale=1.)
    #estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    #print('perturbed data per step avg', estimate)

    value_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=(64, 64),
        output_dim=10)
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    estimator = NeuralQLearning(dataset.spec,
                                value_network,
                                optimizer,
                                gamma,
                                num_qvalues=10)
    for step in range(num_steps):
        batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
        loss, _ = estimator.train_step(batch, target_policy)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', loss)
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            print('estimated per step avg', estimate)

    print('Done!')
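Example #3 trains a NeuralQLearning estimator with a 10-head value network on a PerturbedDataset (10 reward perturbations) and samples nstep_returns + 1 consecutive steps per batch: an n-step target needs n rewards plus one bootstrapped value. A sketch of that arithmetic with hypothetical numbers (the estimator's exact target construction is not shown in the snippet):

# 3-step return with gamma = 0.9, using made-up rewards and a bootstrap value.
gamma = 0.9
rewards = [1.0, 0.0, 2.0]      # n = 3 rewards from consecutive steps...
bootstrap_value = 5.0          # ...plus the value at the (n + 1)-th step
nstep_return = sum(gamma**i * r for i, r in enumerate(rewards))
nstep_return += gamma**len(rewards) * bootstrap_value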
Example #4
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  weight_learning_rate = FLAGS.weight_learning_rate
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  batch_size = FLAGS.batch_size
  num_steps = FLAGS.num_steps

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = ('nlr{NU_LR}_zlr{Z_LR}_batch{BATCH_SIZE}_'
                      'gam{GAMMA}_nreg{NU_REG}_zreg{Z_REG}_algae{ALGAE_ALPHA}_'
                      'prim{PRIMAL}_div{DIV}').format(
                          NU_LR=nu_learning_rate,
                          Z_LR=zeta_learning_rate,
                          BATCH_SIZE=batch_size,
                          GAMMA=gamma,
                          NU_REG=nu_regularizer,
                          Z_REG=zeta_regularizer,
                          ALGAE_ALPHA=algae_alpha,
                          PRIMAL=primal_form,
                          DIV=divergence_limit)
  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.TruncatedNormal(
      stddev=0.5, seed=1)
  hidden_dims = (64,)
  n_intervals = 1
  nu_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                            fc_layer_params=hidden_dims,
                            activation_fn=activation_fn,
                            kernel_initializer=kernel_initializer,
                            last_kernel_initializer=None,
                            output_dim=2 * 2 * n_intervals)
  zeta_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=None,
                              output_dim=2 * 2 * n_intervals)
  weight_network = ValueNetwork((dataset.spec.observation,  # initial state
                                 dataset.spec.observation,  # cur state
                                 dataset.spec.action,       # cur action
                                 dataset.spec.observation), # next state
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=None,
                                output_dim=2 * n_intervals)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate, beta_2=0.99)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate, beta_2=0.99)
  weight_optimizer = tf.keras.optimizers.Adam(weight_learning_rate, beta_2=0.99)

  estimator = NeuralCoinDice(dataset.spec,
                             nu_network, zeta_network,
                             weight_network,
                             nu_optimizer, zeta_optimizer,
                             weight_optimizer,
                             gamma=gamma,
                             divergence_limit=divergence_limit,
                             f_exponent=f_exponent,
                             primal_form=primal_form,
                             nu_regularizer=nu_regularizer,
                             zeta_regularizer=zeta_regularizer,
                             algae_alpha=algae_alpha * np.array([1, 1]),
                             unbias_algae_alpha=False,
                             closed_form_weights=True,
                             num_samples=None)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  @tf.function
  def one_step(transitions_batch, initial_steps_batch):
    global_step.assign_add(1)
    with tf.summary.record_if(tf.math.mod(global_step, 25) == 0):
      initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                  initial_steps_batch)
      losses, _ = estimator.train_step(initial_steps_batch, transitions_batch,
                                       target_policy)
    return losses

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):

      transitions_batch = dataset.get_step(batch_size, num_steps=2)
      initial_steps_batch, _ = dataset.get_episode(
          batch_size, truncate_episode_at=1)
      losses = one_step(transitions_batch, initial_steps_batch)
      running_losses.append([t.numpy() for t in losses])

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated confidence interval %s' % estimate)
        print('avg last 3 estimated confidence interval %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'wb') as f:
      np.save(f, running_estimates)
  print('Done!')
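Examples #1, #2, and #4 all dump their running results to a results.npy under save_dir. Assuming the same gfile API, the saved array can be read back for analysis roughly like this (the path is hypothetical, and the array shape depends on how many logging intervals ran):

import os

import numpy as np
import tensorflow.compat.v2 as tf

save_dir = '/tmp/dice_results'  # hypothetical: wherever the training run wrote to
with tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'rb') as f:
  results = np.load(f)
print(results.shape)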
Example #5
def main(argv):
    env_name = FLAGS.env_name
    data_name = FLAGS.data_name
    seed = FLAGS.seed
    policy_load_dir = FLAGS.policy_load_dir
    data_load_dir = FLAGS.data_load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    nstep_returns = FLAGS.nstep_returns
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    target_policy, env = get_target_policy(policy_load_dir, env_name)

    directory = os.path.join(data_load_dir,
                             'yifan_%s_%s' % (env_name, data_name))
    print('Loading dataset.')
    onpolicy_dataset = TFAgentsOnpolicyDataset(env, target_policy, 1000)
    write_dataset = TFOffpolicyDataset(onpolicy_dataset.spec)
    batch_size = 20      # hard-coded here; also overrides FLAGS.batch_size in the training loop below
    num_trajectory = 10  # number of on-policy episodes to collect
    for batch_num in range(1 + (num_trajectory - 1) // batch_size):
        print(batch_num)
        num_trajectory_after_batch = min(num_trajectory,
                                         batch_size * (batch_num + 1))
        num_trajectory_to_get = num_trajectory_after_batch - batch_num * batch_size
        episodes, valid_steps = onpolicy_dataset.get_episode(
            batch_size=num_trajectory_to_get)
        add_episodes_to_dataset(episodes, valid_steps, write_dataset)
    dataset = write_dataset
    """
  dataset = Dataset.load(directory)
  """
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)

    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)
    dataset = PerturbedDataset(
        dataset,
        num_perturbations=None,  #10,
        perturbation_scale=1.)

    value_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=(64, 64),
        output_dim=None)  #10)
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    estimator = NeuralQLearning(
        dataset.spec,
        value_network,
        optimizer,
        gamma,
        num_qvalues=None,  #10,
        num_samples=1)
    for step in range(num_steps):
        batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
        loss, _ = estimator.train_step(batch, target_policy)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', loss)
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            print('estimated per step avg', estimate)

    print('Done!')
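Example #5 collects on-policy episodes in batches before training; with the hard-coded values above (num_trajectory = 10, batch_size = 20) the loop runs exactly once and requests all 10 episodes. The ceiling-division bookkeeping, checked in isolation with those values:

num_trajectory, batch_size = 10, 20
num_batches = 1 + (num_trajectory - 1) // batch_size         # ceil(10 / 20) == 1
for batch_num in range(num_batches):
    done_after_batch = min(num_trajectory, batch_size * (batch_num + 1))
    num_to_get = done_after_batch - batch_num * batch_size   # 10 on the only pass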