def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)

  estimator = TabularDualDice(dataset.spec, gamma)
  estimate = estimator.solve(dataset, target_policy)
  print('estimated per step avg', estimate)

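# NOTE: The mains in this section each come from a separate run script and all
# read their configuration from absl flags. The import block and flag
# definitions are not shown here, so the sketch below is an assumption of what
# each script declares at module level: the flag names are taken from the
# FLAGS.* reads in these mains, but the defaults and help strings are
# placeholders, and the project-specific imports (Dataset, get_target_policy,
# estimator_lib, the estimator and network classes, Logger, ...) are omitted
# because their module paths do not appear in this section.
import os
import sys

from absl import app
from absl import flags
import numpy as np
import tensorflow as tf

FLAGS = flags.FLAGS
flags.DEFINE_string('env_name', None, 'Environment the dataset was generated for.')
flags.DEFINE_integer('seed', 0, 'Seed used when generating the dataset.')
flags.DEFINE_bool('tabular_obs', True, 'Whether observations are tabular.')
flags.DEFINE_integer('num_trajectory', 100, 'Number of trajectories in the dataset.')
flags.DEFINE_integer('max_trajectory_length', 100, 'Maximum length of a trajectory.')
flags.DEFINE_float('alpha', 0.0, 'Mixture parameter of the behavior policy.')
flags.DEFINE_float('gamma', 0.99, 'Discount factor, in [0, 1).')
flags.DEFINE_string('load_dir', None, 'Directory to load the dataset from.')
flags.DEFINE_string('save_dir', None, 'Directory to save results to.')

# Each script would then dispatch with:
#   if __name__ == '__main__':
#     app.run(main)
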
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  step_encoding = FLAGS.step_encoding
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  max_trajectory_length_train = (
      FLAGS.max_trajectory_length_train or max_trajectory_length)

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_base = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                 'numtraj{NUM_TRAJ}').format(
                     ENV_NAME=env_name,
                     TAB=tabular_obs,
                     ALPHA=alpha,
                     SEED=seed,
                     NUM_TRAJ=num_trajectory)
  hparam_data = hparam_base + '_maxtraj{MAX_TRAJ}'.format(
      MAX_TRAJ=max_trajectory_length_train)
  hparam_out = hparam_base + '_maxtraj{MAX_TRAJ}'.format(
      MAX_TRAJ=max_trajectory_length)

  directory = os.path.join(load_dir, hparam_data)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)

  estimator = TabularTeQDice(dataset.spec, gamma, max_trajectory_length,
                             step_encoding)
  estimate = estimator.solve(dataset, target_policy)
  print('estimated per step avg', estimate)
  print('Done!')

  if save_dir is not None:
    if not tf.io.gfile.isdir(save_dir):
      tf.io.gfile.makedirs(save_dir)
    out_fname = os.path.join(
        save_dir, hparam_out + '_enc{ENC}.npy'.format(ENC=step_encoding))
    print('Saving results to', out_fname)
    with tf.io.gfile.GFile(out_fname, 'w') as f:
      np.save(f, estimate.numpy())

def main(argv):
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  alpha_target = FLAGS.alpha_target
  gamma = FLAGS.gamma
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  num_steps = FLAGS.num_steps
  batch_size = FLAGS.batch_size

  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  primal_regularizer = FLAGS.primal_regularizer
  dual_regularizer = FLAGS.dual_regularizer
  kl_regularizer = FLAGS.kl_regularizer
  zero_reward = FLAGS.zero_reward
  norm_regularizer = FLAGS.norm_regularizer
  zeta_pos = FLAGS.zeta_pos

  scale_reward = FLAGS.scale_reward
  shift_reward = FLAGS.shift_reward
  transform_reward = FLAGS.transform_reward
  eps_std = FLAGS.eps_std

  def reward_fn(env_step):
    reward = env_step.reward * scale_reward + shift_reward
    if transform_reward is None:
      return reward
    if transform_reward == 'exp':
      reward = tf.math.exp(reward)
    elif transform_reward == 'cuberoot':
      reward = tf.sign(reward) * tf.math.pow(tf.abs(reward), 1.0 / 3.0)
    else:
      raise ValueError('Reward {} not implemented.'.format(transform_reward))
    return reward

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  train_hparam_str = (
      'nlr{NLR}_zlr{ZLR}_zeror{ZEROR}_preg{PREG}_dreg{DREG}_kreg{KREG}_'
      'nreg{NREG}_pform{PFORM}_fexp{FEXP}_zpos{ZPOS}_'
      'scaler{SCALER}_shiftr{SHIFTR}_transr{TRANSR}').format(
          NLR=nu_learning_rate,
          ZLR=zeta_learning_rate,
          ZEROR=zero_reward,
          PREG=primal_regularizer,
          DREG=dual_regularizer,
          KREG=kl_regularizer,
          NREG=norm_regularizer,
          PFORM=primal_form,
          FEXP=f_exponent,
          ZPOS=zeta_pos,
          SCALER=scale_reward,
          SHIFTR=shift_reward,
          TRANSR=transform_reward)
  train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std, KL=kl_regularizer)

  if save_dir is not None:
    target_hparam_str = hparam_str.replace(
        'alpha{}'.format(alpha), 'alpha{}_alphat{}'.format(
            alpha, alpha_target))
    save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    summary_writer.set_as_default()
  else:
    tf.summary.create_noop_writer()

  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset from', directory)
  dataset = Dataset.load(directory)
  #dataset = Dataset.load(directory.replace('alpha{}'.format(alpha), 'alpha0.0'))

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)
  print('behavior per-step',
        estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.GlorotUniform()
  hidden_dims = (64, 64)
  input_spec = (dataset.spec.observation, dataset.spec.action)
  nu_network = ValueNetwork(
      input_spec,
      output_dim=2,
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=kernel_initializer)
  output_activation_fn = tf.math.square if zeta_pos else tf.identity
  zeta_network = ValueNetwork(
      input_spec,
      output_dim=2,
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      output_activation_fn=output_activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=kernel_initializer)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)
  lam_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)

  estimator = NeuralBayesDice(
      dataset.spec,
      nu_network,
      zeta_network,
      nu_optimizer,
      zeta_optimizer,
      lam_optimizer,
      gamma,
      zero_reward=zero_reward,
      f_exponent=f_exponent,
      primal_form=primal_form,
      reward_fn=reward_fn,
      primal_regularizer=primal_regularizer,
      dual_regularizer=dual_regularizer,
      kl_regularizer=kl_regularizer,
      eps_std=FLAGS.eps_std,
      norm_regularizer=norm_regularizer,
      nu_regularizer=nu_regularizer,
      zeta_regularizer=zeta_regularizer)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  target_policy = get_target_policy(load_dir, env_name, tabular_obs,
                                    alpha_target)
  running_losses = []
  all_dual = []
  for step in range(num_steps):
    transitions_batch = dataset.get_step(batch_size, num_steps=2)
    initial_steps_batch, _ = dataset.get_episode(
        batch_size, truncate_episode_at=1)
    initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                initial_steps_batch)
    losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                  target_policy)
    running_losses.append(losses)

    if step % 500 == 0 or step == num_steps - 1:
      num_samples = 100
      dual_ests = []
      for i in range(num_samples):
        dual_est = estimator.estimate_average_reward(
            dataset, target_policy, write_summary=(i == 0))
        dual_ests.append(dual_est)
      tf.summary.scalar('dual/mean', tf.math.reduce_mean(dual_ests))
      tf.summary.scalar('dual/std', tf.math.reduce_std(dual_ests))
      tf.print('dual/mean =', tf.math.reduce_mean(dual_ests),
               'dual/std =', tf.math.reduce_std(dual_ests))
      all_dual.append(dual_ests)
      running_losses = []

    global_step.assign_add(1)

  if save_dir is not None:
    np.save(
        tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'w'),
        all_dual)

  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  alpha_learning_rate = FLAGS.alpha_learning_rate
  train_nu_zeta_per_steps = FLAGS.train_nu_zeta_per_steps
  assert 0 <= gamma < 1.
  limit_episodes = FLAGS.limit_episodes

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = ('alr{A_LR}_tnzs{TNZS}_limit{LIMIT}_'
                      'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                          A_LR=alpha_learning_rate,
                          TNZS=train_nu_zeta_per_steps,
                          LIMIT=limit_episodes,
                          GAMMA=gamma,
                          ALGAE_ALPHA=algae_alpha,
                          DIV=divergence_limit)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  alpha_optimizer = tf.keras.optimizers.Adam(
      alpha_learning_rate, beta_1=0.0, beta_2=0.0)

  episodes, valid_steps = dataset.get_all_episodes(limit=limit_episodes)
  num_samples = tf.reduce_sum(
      tf.cast(
          tf.logical_and(valid_steps, episodes.discount > 0)[:, :-1],
          tf.float32))

  estimator = TabularRobustDice(
      dataset_spec=dataset.spec,
      alpha_optimizer=alpha_optimizer,
      gamma=gamma,
      divergence_limit=divergence_limit / num_samples,  #divergence_limit,
      algae_alpha=algae_alpha * np.array([1, 1]),
      limit_episodes=limit_episodes)
  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  def one_step(transitions_batch, initial_steps_batch, target_policy):
    global_step.assign_add(1)
    #initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
    #                                            initial_steps_batch)
    #losses, _ = estimator.train_alpha(initial_steps_batch, transitions_batch,
    #                                  target_policy)
    #return losses

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):
      if step % train_nu_zeta_per_steps == 0:
        # First solve for the primal nu_loss.
        print('Step: {}. Solve for an updated tabular nu/zeta.'.format(step))
        loss = estimator.solve_nu_zeta(dataset, target_policy)
      running_losses.append(loss)

      one_step(None, None, None)

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated per step avg %s' % estimate)
        print('avg last 3 estimated per step avg %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimates)
  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  policy_learning_rate = FLAGS.policy_learning_rate
  q_learning_rate = FLAGS.q_learning_rate
  batch_size = FLAGS.batch_size
  mode = FLAGS.mode
  ci_method = FLAGS.ci_method
  delta = FLAGS.delta
  delta_tail = 1 - delta
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  use_trained_policy = FLAGS.use_trained_policy
  use_doubly_robust = FLAGS.use_doubly_robust
  assert 0 <= gamma < 1.

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  if FLAGS.num_trajectory_data is not None:
    hparam_str_data = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                       'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                           ENV_NAME=env_name,
                           TAB=tabular_obs,
                           ALPHA=alpha,
                           SEED=seed,
                           NUM_TRAJ=FLAGS.num_trajectory_data,
                           MAX_TRAJ=max_trajectory_length)
  else:
    hparam_str_data = hparam_str

  directory = os.path.join(load_dir, hparam_str_data)
  print('Loading dataset.')
  dataset = Dataset.load(directory)

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = (
      'plr{P_LR}_tp{TRAINED_P}_batch{BATCH_SIZE}_mode{MODE}_CI{CI_METHOD}_'
      'UTP{USE_TRAINED_POLICY}_gam{GAMMA}_del{DELTA}').format(
          P_LR=policy_learning_rate,
          TRAINED_P=use_trained_policy,
          BATCH_SIZE=batch_size,
          MODE=mode,
          CI_METHOD=ci_method,
          USE_TRAINED_POLICY=use_trained_policy,
          GAMMA=gamma,
          DELTA=delta)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  def non_negative_reward_translation(env_step):
    return env_step.reward - min_reward

  def inv_non_negative_estimate_translation(estimate):
    return estimate + min_reward

  if use_trained_policy:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    policy_optimizer = tf.keras.optimizers.Adam(
        policy_learning_rate, beta_1=0.0, beta_2=0.0)
    policy_network = PolicyNetwork(
        dataset.spec.observation,
        dataset.spec.action,
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    policy_optimizer = None
    policy_network = None

  if use_doubly_robust:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    q_optimizer = tf.keras.optimizers.Adam(q_learning_rate)
    q_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    q_optimizer = None
    q_network = None

  estimator = ImportanceSamplingCI(
      dataset_spec=dataset.spec,
      policy_optimizer=policy_optimizer,
      policy_network=policy_network,
      mode=mode,
      ci_method=ci_method,
      delta_tail=delta_tail,
      gamma=gamma,
      reward_fn=non_negative_reward_translation,
      q_network=q_network,
      q_optimizer=q_optimizer)
  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  # Following is for policy learning + IS confidence interval.
  @tf.function
  def one_step(data_batch):
    global_step.assign_add(1)
    loss = estimator.train_step(data_batch, target_policy)
    return loss

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    running_estimate_cis = []
    for step in range(num_steps):
      data_batch = dataset.get_step(batch_size, num_steps=2)
      loss = one_step(data_batch)
      running_losses.append(loss)

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'loss', np.mean(running_losses, 0))
        running_losses = []

        estimate = estimator.estimate_average_reward(
            dataset, target_policy, episode_limit=num_trajectory)
        estimate = inv_non_negative_estimate_translation(estimate)
        running_estimates.append(estimate)
        print('estimated per step avg %s' % estimate)
        print('avg last 3 estimated per step avg %s' %
              np.mean(running_estimates[-3:], axis=0))

        estimate_ci = estimator.estimate_reward_ci(dataset, target_policy)
        estimate_ci = np.array([
            inv_non_negative_estimate_translation(ele) for ele in estimate_ci
        ])
        running_estimate_cis.append(estimate_ci)
        print('estimated CI per step avg %s' % estimate_ci)
        print('avg last 3 estimated CI per step avg %s' %
              np.mean(running_estimate_cis[-3:], axis=0))

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimate_cis)
  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  weight_learning_rate = FLAGS.weight_learning_rate
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  batch_size = FLAGS.batch_size
  num_steps = FLAGS.num_steps

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = ('nlr{NU_LR}_zlr{Z_LR}_batch{BATCH_SIZE}_'
                      'gam{GAMMA}_nreg{NU_REG}_zreg{Z_REG}_algae{ALGAE_ALPHA}_'
                      'prim{PRIMAL}_div{DIV}').format(
                          NU_LR=nu_learning_rate,
                          Z_LR=zeta_learning_rate,
                          BATCH_SIZE=batch_size,
                          GAMMA=gamma,
                          NU_REG=nu_regularizer,
                          Z_REG=zeta_regularizer,
                          ALGAE_ALPHA=algae_alpha,
                          PRIMAL=primal_form,
                          DIV=divergence_limit)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.TruncatedNormal(
      stddev=0.5, seed=1)
  hidden_dims = (64,)
  n_intervals = 1
  nu_network = ValueNetwork(
      (dataset.spec.observation, dataset.spec.action),
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=None,
      output_dim=2 * 2 * n_intervals)
  zeta_network = ValueNetwork(
      (dataset.spec.observation, dataset.spec.action),
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=None,
      output_dim=2 * 2 * n_intervals)
  weight_network = ValueNetwork(
      (dataset.spec.observation,  # initial state
       dataset.spec.observation,  # cur state
       dataset.spec.action,  # cur action
       dataset.spec.observation),  # next state
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=None,
      output_dim=2 * n_intervals)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate, beta_2=0.99)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate, beta_2=0.99)
  weight_optimizer = tf.keras.optimizers.Adam(weight_learning_rate, beta_2=0.99)

  estimator = NeuralCoinDice(
      dataset.spec,
      nu_network,
      zeta_network,
      weight_network,
      nu_optimizer,
      zeta_optimizer,
      weight_optimizer,
      gamma=gamma,
      divergence_limit=divergence_limit,
      f_exponent=f_exponent,
      primal_form=primal_form,
      nu_regularizer=nu_regularizer,
      zeta_regularizer=zeta_regularizer,
      algae_alpha=algae_alpha * np.array([1, 1]),
      unbias_algae_alpha=False,
      closed_form_weights=True,
      num_samples=None)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  @tf.function
  def one_step(transitions_batch, initial_steps_batch):
    global_step.assign_add(1)
    with tf.summary.record_if(tf.math.mod(global_step, 25) == 0):
      initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                  initial_steps_batch)
      losses, _ = estimator.train_step(initial_steps_batch, transitions_batch,
                                       target_policy)
    return losses

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):
      transitions_batch = dataset.get_step(batch_size, num_steps=2)
      initial_steps_batch, _ = dataset.get_episode(
          batch_size, truncate_episode_at=1)
      losses = one_step(transitions_batch, initial_steps_batch)
      running_losses.append([t.numpy() for t in losses])

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated confidence interval %s' % estimate)
        print('avg last 3 estimated confidence interval %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimates)
  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  learning_rate = FLAGS.learning_rate
  nstep_returns = FLAGS.nstep_returns
  num_steps = FLAGS.num_steps
  batch_size = FLAGS.batch_size

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  dataset = PerturbedDataset(dataset,
                             num_perturbations=10,
                             perturbation_scale=1.)
  #estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  #print('perturbed data per step avg', estimate)

  value_network = ValueNetwork(
      (dataset.spec.observation, dataset.spec.action),
      fc_layer_params=(64, 64),
      output_dim=10)
  optimizer = tf.keras.optimizers.Adam(learning_rate)

  estimator = NeuralQLearning(dataset.spec, value_network, optimizer, gamma,
                              num_qvalues=10)
  for step in range(num_steps):
    batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
    loss, _ = estimator.train_step(batch, target_policy)
    if step % 100 == 0 or step == num_steps - 1:
      print('step', step, 'loss', loss)
      estimate = estimator.estimate_average_reward(dataset, target_policy)
      print('estimated per step avg', estimate)

  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  num_trajectory_train = FLAGS.num_trajectory_train
  if num_trajectory_train is None:
    num_trajectory_train = num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  max_trajectory_length_train = FLAGS.max_trajectory_length_train
  if max_trajectory_length_train is None:
    max_trajectory_length_train = max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  batch_size = FLAGS.batch_size
  num_steps = FLAGS.num_steps
  save_dir = FLAGS.save_dir
  network_dir = os.path.join(save_dir, 'networks') if save_dir else None
  estimate_dir = os.path.join(save_dir, 'estimates') if save_dir else None

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_base = '{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}'.format(
      ENV_NAME=env_name, TAB=tabular_obs, ALPHA=alpha, SEED=seed)
  hparam_data = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
      NUM_TRAJ=num_trajectory if num_steps == 0 else num_trajectory_train,
      MAX_TRAJ=max_trajectory_length
      if num_steps == 0 else max_trajectory_length_train)
  hparam_net = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
      NUM_TRAJ=num_trajectory_train, MAX_TRAJ=max_trajectory_length_train)
  hparam_result = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
      NUM_TRAJ=num_trajectory, MAX_TRAJ=max_trajectory_length)

  if estimate_dir is not None:
    if not tf.io.gfile.isdir(estimate_dir):
      tf.io.gfile.makedirs(estimate_dir)
    log_file = os.path.join(estimate_dir, hparam_result + '.log')
    print("Logging to '{0}'".format(log_file))
    sys.stdout = Logger(log_file)

  directory = os.path.join(load_dir, hparam_data)
  print('Loading dataset from', directory)
  dataset = Dataset.load(directory)

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  activation_fn = tf.nn.tanh
  kernel_initializer = tf.keras.initializers.GlorotUniform()
  hidden_dims = (64,)
  step_encoding = None
  #step_encoding = 'one_hot'

  nu_network = StepValueNetwork(
      (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=kernel_initializer,
      max_trajectory_length_train=max_trajectory_length_train,
      step_encoding=step_encoding)
  zeta_network = StepValueNetwork(
      (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=kernel_initializer,
      max_trajectory_length_train=max_trajectory_length_train,
      step_encoding=step_encoding)
  nu_network.create_variables()
  zeta_network.create_variables()

  try:
    nu_network.load_weights(os.path.join(network_dir, hparam_net, 'nu'))
    zeta_network.load_weights(os.path.join(network_dir, hparam_net, 'zeta'))
    print('loaded networks from', network_dir)
  except:
    print('initialized network from scratch')

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)

  estimator = NeuralTeQDice(
      dataset.spec,
      nu_network,
      zeta_network,
      nu_optimizer,
      zeta_optimizer,
      gamma,
      f_exponent=f_exponent,
      primal_form=primal_form,
      nu_regularizer=nu_regularizer,
      zeta_regularizer=zeta_regularizer)

  running_losses = []
  running_estimates = []
  for step in range(num_steps):
    transitions_batch = dataset.get_step(batch_size, num_steps=2)
    initial_steps_batch, _ = dataset.get_episode(
        batch_size, truncate_episode_at=1)
    initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                initial_steps_batch)
    losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                  target_policy)
    running_losses.append(losses)

    if step % 500 == 0 or step == num_steps - 1:
      print('step', step, 'losses', np.mean(running_losses, 0))
      estimate = estimator.estimate_average_reward(dataset, target_policy)
      running_estimates.append(estimate)
      print('estimated per step avg %f' % estimate)
      print('avg last 3 estimated per step avg %f' %
            np.mean(running_estimates[-3:]))
      if network_dir is not None:
        nu_network.save_weights(os.path.join(network_dir, hparam_net, 'nu'))
        zeta_network.save_weights(
            os.path.join(network_dir, hparam_net, 'zeta'))
        print('saved network weights to',
              os.path.join(network_dir, hparam_net))
      running_losses = []

  if num_steps == 0:
    estimate = estimator.estimate_average_reward(dataset, target_policy)
    running_estimates.append(estimate)
    print('eval only per step avg %f' % np.mean(running_estimates[-3:]))

  if estimate_dir is not None:
    out_fname = os.path.join(estimate_dir, hparam_result + '.npy')
    print('Saving estimation results to', out_fname)
    with tf.io.gfile.GFile(out_fname, 'w') as f:
      np.save(f, running_estimates)

  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  assert 0 <= gamma < 1.
  limit_episodes = FLAGS.limit_episodes

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = ('limit{LIMIT}_'
                      'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                          LIMIT=limit_episodes,
                          GAMMA=gamma,
                          ALGAE_ALPHA=algae_alpha,
                          DIV=divergence_limit)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  estimator = TabularCoinDice(
      dataset_spec=dataset.spec,
      gamma=gamma,
      divergence_limit=divergence_limit,
      algae_alpha=algae_alpha * np.array([1, 1]),
      limit_episodes=limit_episodes)
  estimator.prepare_dataset(dataset, target_policy)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)
  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):
      loss = estimator.train_step(dataset, target_policy)
      running_losses.append(loss)
      global_step.assign_add(1)

      if step % 10 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated confidence interval %s' % estimate)
        print('avg last 3 estimated confidence interval %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimates)
  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  alpha = FLAGS.alpha
  alpha_target = FLAGS.alpha_target

  num_steps = FLAGS.num_steps
  batch_size = FLAGS.batch_size

  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_learning_rate = FLAGS.nu_learning_rate
  solve_for_state_action_ratio = FLAGS.solve_for_state_action_ratio
  eps_std = FLAGS.eps_std
  kl_regularizer = FLAGS.kl_regularizer

  target_policy = get_target_policy(
      load_dir, env_name, tabular_obs, alpha=alpha_target)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('behavior per-step',
        estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

  train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std, KL=kl_regularizer)

  if save_dir is not None:
    # Save for a specific alpha target.
    target_hparam_str = hparam_str.replace(
        'alpha{}'.format(alpha), 'alpha{}_alphat{}'.format(
            alpha, alpha_target))
    save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  estimator = TabularBayesDice(
      dataset_spec=dataset.spec,
      gamma=gamma,
      solve_for_state_action_ratio=solve_for_state_action_ratio,
      zeta_learning_rate=zeta_learning_rate,
      nu_learning_rate=nu_learning_rate,
      kl_regularizer=kl_regularizer,
      eps_std=eps_std,
  )
  estimator.prepare_dataset(dataset, target_policy)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)
  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):
      loss = estimator.train_step()[0]
      running_losses.append(loss)
      global_step.assign_add(1)

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = estimator.estimate_average_reward(dataset, target_policy)
        tf.debugging.check_numerics(estimate, 'NaN in estimate')
        running_estimates.append(estimate)
        tf.print('est', tf.math.reduce_mean(estimate),
                 tf.math.reduce_std(estimate))
        running_losses = []

  if save_dir is not None:
    with tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'w') as f:
      np.save(f, running_estimates)
    print('saved results to %s' % save_dir)

  print('Done!')

def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  limit_episodes = FLAGS.limit_episodes

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)

  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  train_hparam_str = ('gamma{GAM}_limit{LIMIT}').format(
      GAM=gamma, LIMIT=limit_episodes)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  estimator = TabularQLearning(
      dataset.spec,
      gamma,
      num_qvalues=200,
      perturbation_scale=[0.0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 1.],
      default_reward_value=0.0,
      limit_episodes=limit_episodes)
  estimate = estimator.solve(dataset, target_policy)
  print('estimated per step avg', estimate)

  if save_dir is not None:
    results_dir = os.path.join(save_dir, hparam_str)
    if not tf.io.gfile.exists(results_dir):
      tf.io.gfile.makedirs(results_dir)
    results_filename = os.path.join(results_dir,
                                    'results_%s.npy' % train_hparam_str)
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, estimate)

  print('Done!')

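# Most of the runs above persist their estimates with np.save into a
# tf.io.gfile.GFile (e.g. 'results.npy' or '<hparams>.npy'). Below is a minimal
# sketch, not part of the original scripts, for reading such a file back for
# analysis; the helper name and example path are hypothetical.
def load_saved_estimates(results_filename):
  """Loads an array of estimates written by one of the runs above."""
  with tf.io.gfile.GFile(results_filename, 'rb') as f:
    # allow_pickle covers the case where a ragged list of estimates was saved.
    return np.load(f, allow_pickle=True)

# Example usage (hypothetical path):
#   estimates = load_saved_estimates('/tmp/dice_results/results.npy')
#   print(estimates.mean(axis=0))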