def get_expert_dataset(
    expert,
    venv,
    total_timesteps,
):
    filename = f"/tmp/{uuid.uuid4()}"
    n_episodes = total_timesteps // get_horizon(venv)
    generate_expert_traj(expert, save_path=filename, env=venv, n_episodes=n_episodes)
    dataset = ExpertDataset(expert_path=f"{filename}.npz", verbose=0)
    return dataset


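# Usage sketch (illustrative only, not part of the module): behavioral-cloning
# pretraining on the generated expert dataset. Assumes `expert` is a trained
# stable-baselines model and `venv` a vectorized copy of the same environment,
# both supplied by the caller; `_example_pretrain_from_expert` is a hypothetical
# helper name.
def _example_pretrain_from_expert(expert, venv):
    dataset = get_expert_dataset(expert, venv, total_timesteps=10000)
    learner = PPO2(MlpPolicy, venv)
    # stable-baselines models support supervised pretraining on an ExpertDataset.
    learner.pretrain(dataset, n_epochs=100)
    return learner

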
def maximum_entropy_irl(
    venv,
    expert=None,
    expert_venv=None,
    expert_trajectories=None,
    causal=True,
    total_timesteps=10000,
    **kwargs,
):
    """Tabular maximum (causal) entropy IRL via occupancy matching."""
    if expert_trajectories is None:
        expert_trajectories = sample_trajectories(
            expert_venv, expert, n_timesteps=total_timesteps
        )

    # Empirical state-occupancy distribution of the expert demonstrations.
    nS = venv.observation_space.n
    expert_occupancy = np.zeros(nS)
    for trj in expert_trajectories:
        for ob in trj.obs:
            expert_occupancy[ob] += 1.0
    expert_occupancy /= expert_occupancy.sum()

    # Linear reward in one-hot state features.
    state_features = np.identity(nS)
    reward_model = LinearRewardModel(state_features)

    # Choose the Bellman backup: maximum causal entropy (MCE) or plain max-ent.
    q_update_fn = mce_q_update_fn if causal else max_ent_q_update_fn

    horizon = get_horizon(venv)
    initial_state_distribution = get_initial_state_dist(venv)

    irl_reward, policy_matrix = occupancy_match_irl(
        dynamics=get_transition_matrix(venv),
        horizon=horizon,
        reward_model=reward_model,
        expert_occupancy=expert_occupancy,
        initial_state_distribution=initial_state_distribution,
        max_iterations=total_timesteps,
        q_update_fn=q_update_fn,
    )
    policy = LightweightRLModel.from_matrix(policy_matrix, env=venv)

    results = {}
    results["reward_model"] = irl_reward
    results["policy"] = policy
    return results


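# Usage sketch (illustrative only): recovering a reward and policy from expert
# rollouts in a tabular environment. Assumes `venv` is a vectorized tabular env
# supported by get_horizon/get_transition_matrix and `expert` is a policy whose
# rollouts serve as demonstrations; `_example_max_ent_irl` is a hypothetical name.
def _example_max_ent_irl(venv, expert):
    results = maximum_entropy_irl(
        venv,
        expert=expert,
        expert_venv=venv,
        causal=True,
        total_timesteps=10000,
    )
    learned_reward = results["reward_model"]
    learned_policy = results["policy"]
    return learned_reward, learned_policy

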
def hard_value_iteration(venv, discount=1.0):
    horizon = get_horizon(venv)
    nS = venv.observation_space.n
    nA = venv.action_space.n
    reward_matrix = force_shape(get_reward_matrix(venv), (nS, nA, nS))
    dynamics = get_transition_matrix(venv)

    Q = np.empty((horizon, nS, nA))
    V = np.empty((horizon + 1, nS))
    V[-1] = np.zeros(nS)
    for t in reversed(range(horizon)):
        for s in range(nS):
            for a in range(nA):
                Q[t, s, a] = dynamics[s, a, :] @ (
                    reward_matrix[s, a, :] + discount * V[t + 1, :]
                )
        V[t] = np.max(Q[t], axis=1)

    policy = np.eye(nA)[Q.argmax(axis=2)]
    return policy


def soft_value_iteration(venv, beta=10):
    """Finite-horizon soft value iteration returning a stochastic (softmax) policy.

    NOTE: the `beta` temperature argument is currently unused; the backup below
    uses a unit temperature.
    """
    horizon = get_horizon(venv)
    nS = venv.observation_space.n
    nA = venv.action_space.n
    reward_matrix = force_shape(get_reward_matrix(venv), (nS, nA, nS))
    dynamics = get_transition_matrix(venv)

    Q = np.empty((horizon, nS, nA))
    V = np.empty((horizon + 1, nS))
    V[-1] = np.zeros(nS)
    for t in reversed(range(horizon)):
        for s in range(nS):
            for a in range(nA):
                Q[t, s, a] = dynamics[s, a, :] @ (reward_matrix[s, a, :] + V[t + 1, :])
        V[t] = logsumexp(Q[t], axis=1)

    # pi(a | s, t) = exp(Q[t, s, a] - V[t, s]) is a softmax over actions.
    policy = np.exp(Q - V[:-1, :, None])
    return policy


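# Illustrative check (not part of the algorithms above): both routines return a
# policy of shape (horizon, nS, nA). Hard value iteration yields one-hot rows,
# while soft value iteration yields softmax rows that sum to one. Assumes `venv`
# is a tabular env supported by the helpers above; the function name is
# hypothetical.
def _example_compare_value_iteration(venv):
    hard_policy = hard_value_iteration(venv)
    soft_policy = soft_value_iteration(venv)
    assert hard_policy.shape == soft_policy.shape
    # Soft-policy rows are proper probability distributions over actions.
    assert np.allclose(soft_policy.sum(axis=-1), 1.0)
    return hard_policy, soft_policy

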
def preferences(
    venv,
    expert=None,
    evaluate_trajectories_fn=None,
    n_pairs_per_batch=50,
    n_timesteps_per_query=None,
    reward_lr=1e-3,
    policy_lr=1e-3,
    policy_epoch_timesteps=200,
    total_timesteps=10000,
    state_only=False,
    use_rnd_bonus=False,
    rnd_lr=1e-3,
    rnd_coeff=0.5,
    normalize_extrinsic=False,
    egreedy_sampling=False,
    **kwargs,
):
    """Learn a reward model from preferences over pairs of trajectory segments,
    while training a PPO2 policy on the learned reward."""
    if n_pairs_per_batch is None:
        horizon = get_horizon(venv)
        n_pairs_per_batch = n_timesteps_per_query / (2 * horizon)

    if evaluate_trajectories_fn is None:
        reward_eval_fn = reward_eval_path_fn(venv)
        evaluate_trajectories_fn = get_eval_trajectories_fn(reward_eval_fn)

    # Create reward model
    rn = BasicShapedRewardNet(
        venv.observation_space,
        venv.action_space,
        theta_units=[32, 32],
        phi_units=[32, 32],
        scale=True,
        state_only=state_only,
    )

    # Compute trajectory probabilities
    preferences_ph = tf.placeholder(
        shape=(None, 2),
        dtype=tf.float32,
        name="preferences",
    )
    num_segments = 2 * tf.shape(preferences_ph)[0]
    rewards_out = tf.reshape(rn.reward_output_train, [num_segments, -1])
    returns_out = tf.reduce_sum(rewards_out, axis=1)
    returns = tf.reshape(returns_out, shape=[-1, 2])
    log_probs = tf.nn.log_softmax(returns, axis=1)

    # Write loss and optimizer op
    loss = (-1) * tf.reduce_sum(log_probs * preferences_ph)
    optimizer = tf.train.AdamOptimizer(learning_rate=reward_lr)
    reward_train_op = optimizer.minimize(loss)

    base_extrinsic_reward_fn = get_reward_fn_from_model(rn)

    if not use_rnd_bonus:
        reward_fn = base_extrinsic_reward_fn
    else:
        # Random network distillation bonus
        rnd_size = 50
        inputs = [rn.obs_inp, rn.act_inp]
        inputs = [tf.layers.flatten(x) for x in inputs]
        inputs = tf.concat(inputs, axis=1)

        rnd_target_net = build_mlp([32, 32, 32], output_size=rnd_size)
        rnd_target = sequential(inputs, rnd_target_net)

        rnd_pred_net = build_mlp([32, 32, 32], output_size=rnd_size)
        rnd_pred = sequential(inputs, rnd_pred_net)

        rnd_loss = tf.reduce_mean((tf.stop_gradient(rnd_target) - rnd_pred) ** 2)
        rnd_optimizer = tf.train.AdamOptimizer(learning_rate=rnd_lr)
        rnd_train_op = rnd_optimizer.minimize(rnd_loss)

        runn_rnd_rews = RunningMeanVar(alpha=0.01)

        def rnd_reward_fn(obs, acts=None, *args, **kwargs):
            if acts is None:
                acts = [venv.action_space.sample()]
            # The prediction error of the RND head is the intrinsic reward.
            int_rew = sess.run(rnd_loss, feed_dict={rn.obs_ph: obs, rn.act_ph: acts})
            int_rew_old = int_rew
            int_rew = runn_rnd_rews.exp_update(int_rew)
            return int_rew

        if normalize_extrinsic:
            runn_ext_rews = RunningMeanVar(alpha=0.01)

        def extrinsic_reward_fn(*args, **kwargs):
            ext_rew = base_extrinsic_reward_fn(*args, **kwargs)
            if normalize_extrinsic:
                ext_rew = runn_ext_rews.exp_update(ext_rew)
            return ext_rew

        def reward_fn(*args, **kwargs):
            return extrinsic_reward_fn(*args, **kwargs) + rnd_coeff * rnd_reward_fn(
                *args, **kwargs
            )

    # Create learner from reward model
    venv_train = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
    policy = PPO2(MlpPolicy, venv_train, learning_rate=policy_lr)

    # Start training
    sess = tf.get_default_session()
    sess.run(tf.global_variables_initializer())

    sampling_policy = make_egreedy(policy, venv) if egreedy_sampling else policy

    num_epochs = int(np.ceil(total_timesteps / policy_epoch_timesteps))
    for epoch in range(num_epochs):
        trajectories = sample_trajectories(venv, sampling_policy, 2 * n_pairs_per_batch)

        segments = get_segments(trajectories)

        # Synthetic preferences: prefer whichever segment of each pair has the
        # higher ground-truth return.
        seg_returns = evaluate_trajectories_fn(segments)
        seg_returns = seg_returns.reshape(-1, 2)
        preferences = np.stack(
            [
                seg_returns[:, 0] > seg_returns[:, 1],
                seg_returns[:, 1] > seg_returns[:, 0],
            ],
            axis=1,
        )

        obs = np.concatenate([seg.obs for seg in segments])
        acts = np.concatenate([seg.acts for seg in segments])
        next_obs = np.concatenate([seg.next_obs for seg in segments])

        ops = [reward_train_op]
        if use_rnd_bonus:
            ops.append(rnd_train_op)
        sess.run(
            ops,
            feed_dict={
                rn.obs_ph: obs,
                rn.act_ph: acts,
                rn.next_obs_ph: next_obs,
                preferences_ph: preferences,
            },
        )

        policy.learn(total_timesteps=policy_epoch_timesteps)

    results = {}
    results["reward_model"] = rn
    results["policy"] = policy
    return results


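# Usage sketch (illustrative only): `preferences` relies on
# tf.get_default_session(), so it must be called with a default TF1 session in
# place. `venv` is assumed to be a vectorized environment constructed elsewhere;
# the function name below is hypothetical.
def _example_preferences(venv):
    sess = tf.Session()
    with sess.as_default():
        results = preferences(
            venv,
            n_pairs_per_batch=50,
            policy_epoch_timesteps=200,
            total_timesteps=10000,
        )
    return results["policy"], results["reward_model"]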