import maxent
import optimizer


def execute_maxent_causal(world, terminal, trajectories, discount=0.7):
    """Maximum Causal Entropy Inverse Reinforcement Learning."""
    # set up features: we use one feature vector per state
    features = world.state_features()

    # choose our parameter initialization strategy:
    # initialize parameters with constant
    init = optimizer.Constant(1.0)

    # choose our optimization strategy:
    # we select exponentiated stochastic gradient ascent with linear learning-rate decay
    optim = optimizer.ExpSga(lr=optimizer.linear_decay(lr0=0.2))

    # actually do some inverse reinforcement learning
    reward = maxent.irl_causal(world.p_transition, features, terminal,
                               trajectories, optim, init, discount)

    return reward
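# For intuition, here is a minimal, self-contained sketch of what an
# exponentiated gradient-ascent step with linear learning-rate decay can
# look like. This is an illustrative re-implementation, not the optimizer
# module's actual code; linear_decay and exp_sga_step are hypothetical
# stand-ins.

import numpy as np

def linear_decay(lr0=0.2, decay_rate=1.0):
    # learning rate shrinks as lr0 / (1 + decay_rate * k) with step count k
    def lr(k):
        return lr0 / (1.0 + decay_rate * k)
    return lr

def exp_sga_step(omega, grad, lr, k):
    # multiplicative (exponentiated) update; keeps all parameters positive
    return omega * np.exp(lr(k) * grad)

omega = np.ones(5)                            # constant initialization, as above
grad = np.array([0.1, -0.2, 0.0, 0.3, -0.1])  # a fixed toy gradient
for k in range(3):
    omega = exp_sga_step(omega, grad, linear_decay(lr0=0.2), k)
print(omega)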
import gridworld as W
import maxent as M
import optimizer as O


def maxent(world, terminal, trajectories):
    """Maximum Entropy Inverse Reinforcement Learning."""
    # set up features: we use one feature vector per state
    features = W.state_features(world)

    # choose our parameter initialization strategy:
    # initialize parameters with constant
    init = O.Constant(1.0)

    # choose our optimization strategy:
    # we select exponentiated stochastic gradient ascent with linear learning-rate decay
    optim = O.ExpSga(lr=O.linear_decay(lr0=0.2))

    # actually do some inverse reinforcement learning
    reward = M.irl(world.p_transition, features, terminal, trajectories, optim, init)

    return reward
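# The quantity MaxEnt IRL matches is the expert's empirical feature
# expectation. A minimal sketch of that computation, assuming trajectories
# are plain state sequences and features is an (n_states, n_features)
# matrix; this mirrors the input M.irl consumes but is not the library's
# implementation.

import numpy as np

def feature_expectation_from_trajectories(features, trajectories):
    # average the feature vectors of all states visited by the expert
    fe = np.zeros(features.shape[1])
    for trajectory in trajectories:
        for state in trajectory:
            fe += features[state]
    return fe / len(trajectories)

toy_features = np.eye(4)                   # one indicator feature per state
toy_trajectories = [[0, 1, 3], [0, 2, 3]]  # two toy expert state sequences
print(feature_expectation_from_trajectories(toy_features, toy_trajectories))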
import numpy as np

# ... the final steps of the maxent_irl training loop (body abridged):

        # re-compute delta for convergence check
        delta = np.max(np.abs(omega_old - omega))

    # re-compute per-state reward and return
    return features.dot(omega)


# set up the GridWorld Markov Decision Process
world, reward, terminal = setup_mdp()

# generate some "expert" trajectories (and the corresponding policy for visualization)
trajectories, expert_policy = generate_expert_trajectories(world, reward, terminal)

# set up features: we use one feature vector per state
features = W.state_features(world)

# choose our parameter initialization strategy:
# initialize parameters with constant
init = O.Constant(1.0)

# choose our optimization strategy:
# we select exponentiated stochastic gradient ascent with linear learning-rate decay
optim = O.ExpSga(lr=O.linear_decay(lr0=0.2))

# actually do some inverse reinforcement learning
reward_maxent = maxent_irl(world.p_transition, features, terminal, trajectories, optim, init)

print("Done")
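# For completeness, a minimal sketch of the optimization loop that the
# delta-based convergence check above belongs to. The expected state
# visitation frequency (SVF) is replaced here by a deliberately simplified
# softmax stand-in (expected_svf_stub); the real computation uses the MDP
# dynamics, and maxent_irl_sketch is a hypothetical name, not the function
# called above.

import numpy as np

def expected_svf_stub(reward):
    # hypothetical stand-in: the true expected SVF comes from the MDP dynamics
    e = np.exp(reward - reward.max())
    return e / e.sum()

def maxent_irl_sketch(features, e_features, lr0=0.2, eps=1e-4):
    omega = np.ones(features.shape[1])   # constant initialization
    delta, k = np.inf, 0
    while delta > eps:
        omega_old = omega.copy()
        # gradient: expert feature expectation minus expected feature counts
        grad = e_features - features.T.dot(expected_svf_stub(features.dot(omega)))
        # exponentiated ascent step with decaying learning rate
        omega = omega * np.exp(lr0 / (1.0 + k) * grad)
        k += 1
        # re-compute delta for convergence check
        delta = np.max(np.abs(omega_old - omega))
    # re-compute per-state reward and return
    return features.dot(omega)

print(maxent_irl_sketch(np.eye(3), np.array([0.2, 0.3, 0.5])))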