def main(argv):
  """Trains a NeuralBayesDice estimator on an offline dataset.

  Loads a dataset identified by the flag hyperparameters, builds nu/zeta
  value networks, trains the estimator, and periodically logs the mean and
  std of sampled dual estimates of the target policy's per-step average
  reward. Results are saved to `<save_dir>/.../results.npy` when save_dir
  is set.

  Args:
    argv: Unused command-line arguments (configuration comes from FLAGS).
  """
  # Read experiment configuration from command-line flags.
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  alpha_target = FLAGS.alpha_target
  gamma = FLAGS.gamma
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  num_steps = FLAGS.num_steps
  batch_size = FLAGS.batch_size
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  primal_regularizer = FLAGS.primal_regularizer
  dual_regularizer = FLAGS.dual_regularizer
  # NOTE(review): FLAGS.kl_regularizer was read twice in the original;
  # the duplicate read has been removed.
  kl_regularizer = FLAGS.kl_regularizer
  zero_reward = FLAGS.zero_reward
  norm_regularizer = FLAGS.norm_regularizer
  zeta_pos = FLAGS.zeta_pos
  scale_reward = FLAGS.scale_reward
  shift_reward = FLAGS.shift_reward
  transform_reward = FLAGS.transform_reward
  eps_std = FLAGS.eps_std

  def reward_fn(env_step):
    """Applies the configured affine and nonlinear reward transformation."""
    reward = env_step.reward * scale_reward + shift_reward
    if transform_reward is None:
      return reward
    if transform_reward == 'exp':
      reward = tf.math.exp(reward)
    elif transform_reward == 'cuberoot':
      # Sign-preserving cube root.
      reward = tf.sign(reward) * tf.math.pow(tf.abs(reward), 1.0 / 3.0)
    else:
      raise ValueError('Reward {} not implemented.'.format(transform_reward))
    return reward

  # Hyperparameter string identifying the dataset directory.
  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  # NOTE(review): the original built a much longer train_hparam_str
  # (nlr/zlr/zeror/preg/... fields) and immediately overwrote it with the
  # string below; the dead first assignment has been removed.
  train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std, KL=kl_regularizer)

  if save_dir is not None:
    # Record the target policy's alpha alongside the behavior alpha.
    target_hparam_str = hparam_str.replace(
        'alpha{}'.format(alpha),
        'alpha{}_alphat{}'.format(alpha, alpha_target))
    save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    summary_writer.set_as_default()
  else:
    tf.summary.create_noop_writer()

  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset from', directory)
  dataset = Dataset.load(directory)

  # Basic dataset statistics, for sanity checking.
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)
  print('behavior per-step',
        estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

  # Build nu and zeta networks over (observation, action) inputs.
  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.GlorotUniform()
  hidden_dims = (64, 64)
  input_spec = (dataset.spec.observation, dataset.spec.action)
  nu_network = ValueNetwork(
      input_spec,
      output_dim=2,
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=kernel_initializer)
  # Squaring the output constrains zeta to be non-negative when requested.
  output_activation_fn = tf.math.square if zeta_pos else tf.identity
  zeta_network = ValueNetwork(
      input_spec,
      output_dim=2,
      fc_layer_params=hidden_dims,
      activation_fn=activation_fn,
      output_activation_fn=output_activation_fn,
      kernel_initializer=kernel_initializer,
      last_kernel_initializer=kernel_initializer)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)
  lam_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)

  estimator = NeuralBayesDice(
      dataset.spec,
      nu_network,
      zeta_network,
      nu_optimizer,
      zeta_optimizer,
      lam_optimizer,
      gamma,
      zero_reward=zero_reward,
      f_exponent=f_exponent,
      primal_form=primal_form,
      reward_fn=reward_fn,
      primal_regularizer=primal_regularizer,
      dual_regularizer=dual_regularizer,
      kl_regularizer=kl_regularizer,
      eps_std=eps_std,  # was FLAGS.eps_std; identical value, local for consistency
      norm_regularizer=norm_regularizer,
      nu_regularizer=nu_regularizer,
      zeta_regularizer=zeta_regularizer)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  target_policy = get_target_policy(load_dir, env_name, tabular_obs,
                                    alpha_target)
  running_losses = []
  all_dual = []
  for step in range(num_steps):
    transitions_batch = dataset.get_step(batch_size, num_steps=2)
    initial_steps_batch, _ = dataset.get_episode(
        batch_size, truncate_episode_at=1)
    # Keep only the first step of each truncated episode.
    initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                initial_steps_batch)
    losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                  target_policy)
    running_losses.append(losses)
    # Report every 500 steps and on the final step.
    if step % 500 == 0 or step == num_steps - 1:
      # Sample multiple dual estimates to report their mean and spread.
      num_samples = 100
      dual_ests = []
      for i in range(num_samples):
        dual_est = estimator.estimate_average_reward(
            dataset, target_policy, write_summary=(i == 0))
        dual_ests.append(dual_est)
      tf.summary.scalar('dual/mean', tf.math.reduce_mean(dual_ests))
      tf.summary.scalar('dual/std', tf.math.reduce_std(dual_ests))
      tf.print('dual/mean =', tf.math.reduce_mean(dual_ests),
               'dual/std =', tf.math.reduce_std(dual_ests))
      all_dual.append(dual_ests)
      running_losses = []
    global_step.assign_add(1)

  if save_dir is not None:
    # Use a context manager so the GFile handle is closed (the original
    # passed an unclosed GFile straight into np.save).
    with tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'w') as f:
      np.save(f, all_dual)
  print('Done!')
def main(argv):
  """Off-policy evaluation with importance-sampling confidence intervals.

  Loads an offline dataset and a target policy, optionally learns a behavior
  policy and/or a Q-network (doubly-robust variant), trains an
  ImportanceSamplingCI estimator, and periodically prints point estimates and
  confidence intervals of the target policy's per-step average reward.
  Confidence intervals are saved to `<save_dir>/.../results.npy` when
  save_dir is set.

  Args:
    argv: Unused command-line arguments (configuration comes from FLAGS).
  """
  # Read experiment configuration from command-line flags.
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  policy_learning_rate = FLAGS.policy_learning_rate
  q_learning_rate = FLAGS.q_learning_rate
  batch_size = FLAGS.batch_size
  mode = FLAGS.mode
  ci_method = FLAGS.ci_method
  delta = FLAGS.delta
  # Tail mass passed to the CI estimator (e.g. delta=0.95 -> tail 0.05).
  delta_tail = 1 - delta
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  use_trained_policy = FLAGS.use_trained_policy
  use_doubly_robust = FLAGS.use_doubly_robust
  assert 0 <= gamma < 1.

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  # Hyperparameter string identifying the experiment (and dataset directory).
  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  # The dataset on disk may hold a different number of trajectories than the
  # number used for evaluation; load from the data-specific string if given.
  if FLAGS.num_trajectory_data is not None:
    hparam_str_data = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                       'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                           ENV_NAME=env_name,
                           TAB=tabular_obs,
                           ALPHA=alpha,
                           SEED=seed,
                           NUM_TRAJ=FLAGS.num_trajectory_data,
                           MAX_TRAJ=max_trajectory_length)
  else:
    hparam_str_data = hparam_str

  directory = os.path.join(load_dir, hparam_str_data)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  # Basic dataset statistics, for sanity checking. min_reward is also used
  # below to shift rewards to be non-negative.
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)
  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  # NOTE(review): both TRAINED_P and USE_TRAINED_POLICY are filled with
  # use_trained_policy, so the string records it twice — confirm whether
  # TRAINED_P was meant to hold a different value.
  train_hparam_str = (
      'plr{P_LR}_tp{TRAINED_P}_batch{BATCH_SIZE}_mode{MODE}_CI{CI_METHOD}_UTP{USE_TRAINED_POLICY}_gam{GAMMA}_del{DELTA}'
  ).format(
      P_LR=policy_learning_rate,
      TRAINED_P=use_trained_policy,
      BATCH_SIZE=batch_size,
      MODE=mode,
      CI_METHOD=ci_method,
      USE_TRAINED_POLICY=use_trained_policy,
      GAMMA=gamma,
      DELTA=delta)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  def non_negative_reward_translation(env_step):
    # Shift rewards so the dataset minimum maps to zero.
    return env_step.reward - min_reward

  def inv_non_negative_estimate_translation(estimate):
    # Undo the reward shift on a per-step average-reward estimate.
    return estimate + min_reward

  if use_trained_policy:
    # Learn a behavior policy from the data (used for importance weights).
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    policy_optimizer = tf.keras.optimizers.Adam(
        policy_learning_rate, beta_1=0.0, beta_2=0.0)
    policy_network = PolicyNetwork(
        dataset.spec.observation,
        dataset.spec.action,
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    policy_optimizer = None
    policy_network = None

  if use_doubly_robust:
    # Q-network for the doubly-robust variant of the estimator.
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    q_optimizer = tf.keras.optimizers.Adam(q_learning_rate)
    q_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    q_optimizer = None
    q_network = None

  estimator = ImportanceSamplingCI(
      dataset_spec=dataset.spec,
      policy_optimizer=policy_optimizer,
      policy_network=policy_network,
      mode=mode,
      ci_method=ci_method,
      delta_tail=delta_tail,
      gamma=gamma,
      reward_fn=non_negative_reward_translation,
      q_network=q_network,
      q_optimizer=q_optimizer)
  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  # Following is for policy learning + IS confidence interval.
  @tf.function
  def one_step(data_batch):
    global_step.assign_add(1)
    loss = estimator.train_step(data_batch, target_policy)
    return loss

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    running_estimate_cis = []
    for step in range(num_steps):
      data_batch = dataset.get_step(batch_size, num_steps=2)
      loss = one_step(data_batch)
      running_losses.append(loss)
      # Report every 500 steps and on the final step.
      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'loss', np.mean(running_losses, 0))
        running_losses = []
        estimate = estimator.estimate_average_reward(
            dataset, target_policy, episode_limit=num_trajectory)
        estimate = inv_non_negative_estimate_translation(estimate)
        running_estimates.append(estimate)
        print('estimated per step avg %s' % estimate)
        print('avg last 3 estimated per step avg %s' %
              np.mean(running_estimates[-3:], axis=0))
        # Confidence interval, mapped back to the original reward scale.
        estimate_ci = estimator.estimate_reward_ci(dataset, target_policy)
        estimate_ci = np.array([
            inv_non_negative_estimate_translation(ele) for ele in estimate_ci
        ])
        running_estimate_cis.append(estimate_ci)
        print('estimated CI per step avg %s' % estimate_ci)
        print('avg last 3 estimated CI per step avg %s' %
              np.mean(running_estimate_cis[-3:], axis=0))

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimate_cis)
  print('Done!')
def main(argv):
  """Trains an ensemble neural Q-learning estimator on a perturbed dataset.

  Loads an offline dataset, wraps it in a PerturbedDataset, trains a
  NeuralQLearning estimator with an ensemble of 10 Q-value heads, and
  periodically prints the estimated per-step average reward of the target
  policy.

  Args:
    argv: Unused command-line arguments (configuration comes from FLAGS).
  """
  # Experiment configuration from command-line flags.
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  env_name = FLAGS.env_name
  tabular_obs = FLAGS.tabular_obs
  seed = FLAGS.seed
  alpha = FLAGS.alpha
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  load_dir = FLAGS.load_dir
  learning_rate = FLAGS.learning_rate
  nstep_returns = FLAGS.nstep_returns
  num_steps = FLAGS.num_steps
  batch_size = FLAGS.batch_size

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  # Dataset directory is keyed by the experiment hyperparameters.
  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  dataset_dir = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(dataset_dir)

  # Sanity-check statistics for the loaded data.
  loaded_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(loaded_steps.reward)
  min_reward = tf.reduce_min(loaded_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)
  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  # Replace the dataset with an ensemble of 10 perturbed copies.
  dataset = PerturbedDataset(dataset, num_perturbations=10,
                             perturbation_scale=1.)

  # One Q-value head per perturbation.
  value_network = ValueNetwork(
      (dataset.spec.observation, dataset.spec.action),
      fc_layer_params=(64, 64),
      output_dim=10)
  optimizer = tf.keras.optimizers.Adam(learning_rate)
  estimator = NeuralQLearning(dataset.spec, value_network, optimizer, gamma,
                              num_qvalues=10)

  for step in range(num_steps):
    batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
    loss, _ = estimator.train_step(batch, target_policy)
    # Report every 100 steps and on the final step.
    should_report = step % 100 == 0 or step == num_steps - 1
    if should_report:
      print('step', step, 'loss', loss)
      estimate = estimator.estimate_average_reward(dataset, target_policy)
      print('estimated per step avg', estimate)
  print('Done!')
def main(argv):
  """Trains a NeuralCoinDice estimator for confidence-interval OPE.

  Loads an offline dataset, builds nu/zeta/weight networks, trains the
  estimator, and periodically logs and prints the estimated confidence
  interval of the target policy's per-step average reward. Estimates are
  saved to `<save_dir>/.../results.npy` when save_dir is set.

  Args:
    argv: Unused command-line arguments (configuration comes from FLAGS).
  """
  # Read experiment configuration from command-line flags.
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  weight_learning_rate = FLAGS.weight_learning_rate
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  batch_size = FLAGS.batch_size
  num_steps = FLAGS.num_steps

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  # Hyperparameter string identifying the dataset directory.
  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)

  # Basic dataset statistics, for sanity checking.
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)
  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  # Training hyperparameter string used for the save directory.
  train_hparam_str = ('nlr{NU_LR}_zlr{Z_LR}_batch{BATCH_SIZE}_'
                      'gam{GAMMA}_nreg{NU_REG}_zreg{Z_REG}_algae{ALGAE_ALPHA}_'
                      'prim{PRIMAL}_div{DIV}').format(
                          NU_LR=nu_learning_rate,
                          Z_LR=zeta_learning_rate,
                          BATCH_SIZE=batch_size,
                          GAMMA=gamma,
                          NU_REG=nu_regularizer,
                          Z_REG=zeta_regularizer,
                          ALGAE_ALPHA=algae_alpha,
                          PRIMAL=primal_form,
                          DIV=divergence_limit)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.TruncatedNormal(
      stddev=0.5, seed=1)
  hidden_dims = (64,)
  # Number of confidence intervals estimated simultaneously; output dims of
  # the networks below scale with it.
  n_intervals = 1
  nu_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                            fc_layer_params=hidden_dims,
                            activation_fn=activation_fn,
                            kernel_initializer=kernel_initializer,
                            last_kernel_initializer=None,
                            output_dim=2 * 2 * n_intervals)
  zeta_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=None,
                              output_dim=2 * 2 * n_intervals)
  weight_network = ValueNetwork((dataset.spec.observation,  # initial state
                                 dataset.spec.observation,  # cur state
                                 dataset.spec.action,  # cur action
                                 dataset.spec.observation),  # next state
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=None,
                                output_dim=2 * n_intervals)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate, beta_2=0.99)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate, beta_2=0.99)
  weight_optimizer = tf.keras.optimizers.Adam(weight_learning_rate,
                                              beta_2=0.99)

  estimator = NeuralCoinDice(dataset.spec,
                             nu_network,
                             zeta_network,
                             weight_network,
                             nu_optimizer,
                             zeta_optimizer,
                             weight_optimizer,
                             gamma=gamma,
                             divergence_limit=divergence_limit,
                             f_exponent=f_exponent,
                             primal_form=primal_form,
                             nu_regularizer=nu_regularizer,
                             zeta_regularizer=zeta_regularizer,
                             # Same algae_alpha for both interval directions.
                             algae_alpha=algae_alpha * np.array([1, 1]),
                             unbias_algae_alpha=False,
                             closed_form_weights=True,
                             num_samples=None)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  @tf.function
  def one_step(transitions_batch, initial_steps_batch):
    """Runs one training step; summaries are written every 25 steps."""
    global_step.assign_add(1)
    with tf.summary.record_if(tf.math.mod(global_step, 25) == 0):
      # Keep only the first step of each truncated episode.
      initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                  initial_steps_batch)
      losses, _ = estimator.train_step(initial_steps_batch, transitions_batch,
                                       target_policy)
    return losses

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):
      transitions_batch = dataset.get_step(batch_size, num_steps=2)
      initial_steps_batch, _ = dataset.get_episode(
          batch_size, truncate_episode_at=1)
      losses = one_step(transitions_batch, initial_steps_batch)
      running_losses.append([t.numpy() for t in losses])
      # Report every 500 steps and on the final step.
      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        # First entry of the averaged losses holds the interval estimate.
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated confidence interval %s' % estimate)
        print('avg last 3 estimated confidence interval %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimates)
  print('Done!')
def main(argv):
  """Collects on-policy episodes and trains a NeuralQLearning estimator.

  Loads a target policy and its environment, gathers a small number of
  on-policy episodes into a TFOffpolicyDataset, then trains a NeuralQLearning
  estimator and periodically prints the estimated per-step average reward.

  Args:
    argv: Unused command-line arguments (configuration comes from FLAGS).
  """
  # Read experiment configuration from command-line flags.
  env_name = FLAGS.env_name
  data_name = FLAGS.data_name
  seed = FLAGS.seed
  policy_load_dir = FLAGS.policy_load_dir
  data_load_dir = FLAGS.data_load_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  learning_rate = FLAGS.learning_rate
  nstep_returns = FLAGS.nstep_returns
  num_steps = FLAGS.num_steps
  batch_size = FLAGS.batch_size

  target_policy, env = get_target_policy(policy_load_dir, env_name)
  # NOTE(review): `directory` is only needed by the disabled offline-load
  # path; data is currently collected on-policy below instead.
  directory = os.path.join(data_load_dir,
                           'yifan_%s_%s' % (env_name, data_name))
  print('Loading dataset.')
  onpolicy_dataset = TFAgentsOnpolicyDataset(env, target_policy, 1000)
  write_dataset = TFOffpolicyDataset(onpolicy_dataset.spec)
  # NOTE(review): these overrides clobber the batch_size flag and fix the
  # number of collected trajectories — presumably a debugging setting;
  # confirm before relying on the flag values.
  batch_size = 20
  num_trajectory = 10
  # Collect num_trajectory episodes in batches of at most batch_size.
  for batch_num in range(1 + (num_trajectory - 1) // batch_size):
    print(batch_num)
    num_trajectory_after_batch = min(num_trajectory,
                                     batch_size * (batch_num + 1))
    num_trajectory_to_get = num_trajectory_after_batch - batch_num * batch_size
    episodes, valid_steps = onpolicy_dataset.get_episode(
        batch_size=num_trajectory_to_get)
    add_episodes_to_dataset(episodes, valid_steps, write_dataset)
  dataset = write_dataset
  # Removed dead code: a no-op triple-quoted string that held the disabled
  # `dataset = Dataset.load(directory)` offline-load path.

  # Basic dataset statistics, for sanity checking.
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)
  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  # num_perturbations=None — presumably disables perturbation (the 10-copy
  # ensemble is commented out); confirm against PerturbedDataset.
  dataset = PerturbedDataset(
      dataset,
      num_perturbations=None,  # 10,
      perturbation_scale=1.)

  value_network = ValueNetwork(
      (dataset.spec.observation, dataset.spec.action),
      fc_layer_params=(64, 64),
      output_dim=None)  # 10)
  optimizer = tf.keras.optimizers.Adam(learning_rate)
  estimator = NeuralQLearning(
      dataset.spec,
      value_network,
      optimizer,
      gamma,
      num_qvalues=None,  # 10,
      num_samples=1)

  for step in range(num_steps):
    batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
    loss, _ = estimator.train_step(batch, target_policy)
    # Report every 100 steps and on the final step.
    if step % 100 == 0 or step == num_steps - 1:
      print('step', step, 'loss', loss)
      estimate = estimator.estimate_average_reward(dataset, target_policy)
      print('estimated per step avg', estimate)
  print('Done!')