def generate_lfd_task(mdp: MDP, beta, nTrajs, nSteps):
    # compute optimal policy and corresponding Q-values
    _, Q, pi_opt = mdp.policyIteration()
    # Q = normalizeQ(Q)

    # compute softmax demonstration policy
    pi_dem = softmaxPolicy(Q, beta)

    # generate demonstration set
    trajs = mdp.sampleTrajectories(nTrajs, nSteps, pi_dem, dropLastState=True)
    S, A = trajs['states'], trajs['actions']

    return pi_dem, pi_opt, S, A
def generateImitationTask(mdp: MDP, R, beta, nTrajs, nSteps):
    # set reward function
    mdp.R = R

    # compute optimal policy and corresponding Q-values, then normalize the Q-values
    _, Q, pi_opt = mdp.policyIteration()
    Q_norm = normalizeQ(Q)

    # compute softmax demonstration policy
    pi = softmaxPolicy(Q_norm, beta)

    # generate demonstration set
    trajs = mdp.sampleTrajectories(nTrajs, nSteps, pi, dropLastState=True)
    S, A = trajs['states'], trajs['actions']

    return pi, pi_opt, S, A
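# Illustrative usage sketch (not part of the pipeline): how a demonstration set could be
# generated from a Gridworld. The constructor call mirrors the one in experiment() below;
# the concrete values (shape, discount, beta, nTrajs, nSteps, number of reward states) and
# the `patterns` placeholder are assumptions chosen for illustration only.
def _example_generate_task(patterns):
    import numpy as np

    gw = Gridworld(shape=(5, 5), motionPatterns=patterns, discount=0.9)

    # sparse random reward over a few states, mirroring the setup in experiment() below
    R = np.zeros(gw.nStates)
    rewardStates = np.random.choice(gw.nStates, size=3, replace=False)
    R[rewardStates] = np.random.random(3)

    # beta controls how close the softmax demonstrator is to the optimal policy
    return generateImitationTask(gw, R, beta=10, nTrajs=20, nSteps=50)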
def __init__(self, R=None, discount=None, motionPatterns=None, motionMethod='stay',
             walls=None, shape=None, circular=False):
    # initialize reward and discount through base class constructor
    MDP.__init__(self, R=R, discount=discount)

    # store motion specifications
    if motionMethod not in ('stay', 'renormalize'):
        raise ValueError(f"motion method '{motionMethod}' not available")
    self.motionMethod = motionMethod
    self.motionPatterns = motionPatterns
    self.circular = circular

    # construct map
    self.walls_2d = self.constructMap(walls, shape)
def planner(T):
    # construct an MDP from the given transition model (reward and discount are taken
    # from the enclosing scope) and return its optimal Q-values
    mdp = MDP(T, R, discount)
    V, Q, pi = mdp.policyIteration()
    return Q
def computeValue(self, mdp: MDP, pi):
    """Returns the value of a given policy by computation or lookup."""
    # recompute only if the MDP or the policy changed since the last call
    if self.lastInput[0] != mdp or np.any(self.lastInput[1] != pi):
        self.lastInput = (mdp, pi)
        self.lastResult = mdp.policyEvaluation(pi).mean()
    return self.lastResult
def estimatePolicy(data, env: mdp.MDP, prior, beta, nSamples=100, nBurnin=50):
    if prior == 'pg':
        # create PG model as prior for the subgoal assignments
        distmat = positions2distmat(env.statePositions)
        distmat_kernel = normalize01(distmat + distmat.T) ** 2
        Sigma = NegExponential(distmat_kernel)
        PG = PgMultNormal(M=env.nStates, K_prime=env.nStates - 1, mu=None, Sigma=Sigma,
                          nonInformative=False)

        def fitGoalDists(goals):
            # represent goals as "histograms" (i.e. data for the PG model)
            goal_hists = mdp.det2stoch(goals, env.nStates)
            PG.fit(goal_hists)
            sg_mean = PG.mean_variational_posterior(proj_2_prob=True)['Pi']
            # sg_sample = PG.sample_var_posterior(1, proj_2_prob=True)['Pi']
            return sg_mean
            # return sg_sample

    elif prior == 'dir':
        def fitGoalDists(goals):
            # Dirichlet-smoothed empirical goal distributions
            alpha = 1e-3
            goal_hists = mdp.det2stoch(goals, env.nStates) + alpha
            return goal_hists / goal_hists.sum(axis=1, keepdims=True)

    else:
        raise ValueError(f"unknown prior '{prior}'")

    # compute subgoal values and softmax policies
    _, Q, _ = env.goalPlanning()
    # Q = mdp.normalizeQ(Q)
    goalPolicies = softmax(beta * Q, axis=1)

    # evaluate the action likelihoods under all subgoal policies
    L = mdp.actionLikelihoods(data, goalPolicies, logOut=True)

    # create container for Gibbs samples and initialize first sample randomly
    G = np.zeros((nSamples, env.nStates), dtype=int)
    G[0, :] = np.random.randint(0, env.nStates, env.nStates)

    def gibbs_step(goals):
        # get the conditional goal distributions from the prior model
        goal_distributions = fitGoalDists(goals)
        # combine with the action likelihoods in the log domain and convert back to the linear domain
        weights = softmax(np.log(goal_distributions) + L, axis=1)
        # return a sample from the resulting conditional distribution
        return sampleDiscrete(weights, axis=1)

    # run the Gibbs sampler
    for i in range(nSamples - 1):
        print(i)  # progress output
        G[i + 1, :] = gibbs_step(G[i, :])

    # discard the burn-in samples
    G = G[nBurnin + 1:]

    # construct policy estimate by averaging the subgoal policies of all samples
    pi = np.zeros((env.nStates, env.nActions))
    for goals in G:
        pi += np.array([goalPolicies[s, :, g] for s, g in enumerate(goals)])
    pi /= len(G)

    return pi
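# Self-contained sketch of the conditional update performed inside gibbs_step() above:
# prior goal distributions and action log-likelihoods are combined in the log domain and
# one goal index is drawn per state. The repo's sampleDiscrete() is replaced here by an
# explicit inverse-CDF draw; all shapes and random inputs are illustrative assumptions.
def _example_gibbs_conditional(nStates=4):
    import numpy as np
    from scipy.special import softmax

    rng = np.random.default_rng(0)
    # prior probability of each goal per state (rows sum to one)
    goal_distributions = softmax(rng.normal(size=(nStates, nStates)), axis=1)
    # log-likelihood of the observed actions under each goal's policy
    L = rng.normal(size=(nStates, nStates))

    # combine prior and likelihood in the log domain, then normalize per state
    weights = softmax(np.log(goal_distributions) + L, axis=1)

    # draw one goal index per state from the resulting conditional distribution
    cumulative = np.cumsum(weights, axis=1)
    u = rng.random((nStates, 1))
    return (u < cumulative).argmax(axis=1)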
def experiment(c_mdp, c_gw, c_rand, c_dir, c_pg, c_sweep, id=0):
    # set random seed
    np.random.seed(id)

    # container to store the results
    result = np.full([
        len(c_sweep['envs']),
        len(c_sweep['methods']),
        len(c_sweep['divergences']),
        len(c_sweep['nTrajs'])
    ], fill_value=np.nan)

    # sweep number of trajectories
    for tr, nTrajs in enumerate(c_sweep['nTrajs']):

        # sweep environments
        for env, envName in enumerate(c_sweep['envs']):
            if envName == 'randomMDP':
                mdp = MDP.DirichletMDP(
                    nStates=c_rand['nStates'],
                    nActions=c_rand['nActions'],
                    linkProbability=c_rand['linkProbability'],
                    alpha=c_rand['alpha'])
                env_params = c_rand
                mdp.discount = c_mdp['discount']
            elif envName == 'gridworld':
                # create gridworld once (to avoid recomputation of its properties)
                if 'gw' not in locals():
                    gw = Gridworld(shape=c_gw['shape'],
                                   motionPatterns=c_gw['motionPatterns'],
                                   discount=c_mdp['discount'])
                env_params = c_gw
                mdp = gw
            else:
                raise ValueError('unknown environment')

            # generate random reward function
            R = np.zeros(mdp.nStates)
            rewardStates = np.random.choice(range(mdp.nStates), c_mdp['nRewards'], replace=False)
            R[rewardStates] = np.random.random(c_mdp['nRewards'])

            # generate task
            pi, pi_opt, S, A = generateImitationTask(mdp, R, c_mdp['beta'], nTrajs, c_mdp['nSteps'])

            # sweep inference methods
            for m, method in enumerate(c_sweep['methods']):

                # covariance matrix for Polya-gamma inference
                if method in ('act_pg_traveltime', 'sg_pg_traveltime'):
                    travelTimes = mdp.minimumTravelTimes()
                    Sigma = distmat2covmat(travelTimes)
                elif method in ('act_pg_eucl', 'sg_pg_eucl'):
                    if envName == 'randomMDP':
                        continue
                    distmat = positions2distmat(mdp.statePositions)
                    Sigma = distmat2covmat(distmat)

                # inference parameters
                if method == 'act_dir':
                    params = (c_dir['alpha_act'],)
                elif method == 'act_pg_default':
                    params = (c_pg['mu'], None, False, c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'act_pg_eucl':
                    params = (c_pg['mu'], Sigma, False, c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'act_pg_traveltime':
                    params = (c_pg['mu'], Sigma, False, c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'act_pg_noninf':
                    params = (c_pg['mu'], None, True, c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'sg_dir':
                    subgoalModel = ('dir', dict(alpha=c_dir['alpha_sg']))
                    params = (subgoalModel, c_pg['beta'], c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'sg_pg_default':
                    subgoalModel = ('pg', dict(mu=c_pg['mu'], Sigma=None, nonInformative=False))
                    params = (subgoalModel, c_pg['beta'], c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'sg_pg_eucl':
                    subgoalModel = ('pg', dict(mu=c_pg['mu'], Sigma=Sigma, nonInformative=False))
                    params = (subgoalModel, c_pg['beta'], c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'sg_pg_traveltime':
                    subgoalModel = ('pg', dict(mu=c_pg['mu'], Sigma=Sigma, nonInformative=False))
                    params = (subgoalModel, c_pg['beta'], c_pg['nSamples'], c_pg['nBurnin'])
                elif method == 'sg_pg_noninf':
                    subgoalModel = ('pg', dict(mu=c_pg['mu'], Sigma=None, nonInformative=True))
                    params = (subgoalModel, c_pg['beta'], c_pg['nSamples'], c_pg['nBurnin'])

                # perform inference
                pi_hat = imitationLearning(mdp, S, A, method, params)

                # compute MAP action assignment
                # pi_hat = np.argmax(pi_hat, axis=1)
                # pi_hat = det2stoch(pi_hat, mdp.nActions)

                # sweep divergence measures
                for d, div in enumerate(c_sweep['divergences']):
                    result[env, m, d, tr] = policyDivergence(
                        pi_opt, pi_hat, div, mdp=mdp,
                        distMat=env_params['actionDistances']).mean()

    return result
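# Illustrative call sketch for experiment(). Only the dictionary keys are taken from the
# function above; every concrete value, the method/divergence names in c_sweep, and the
# `patterns` / `actDist` placeholders are assumptions made for illustration only.
def _example_run(patterns, actDist):
    c_mdp = dict(discount=0.9, nRewards=3, beta=10, nSteps=50)
    c_gw = dict(shape=(5, 5), motionPatterns=patterns, actionDistances=actDist)
    c_rand = dict(nStates=25, nActions=4, linkProbability=0.2, alpha=1.0, actionDistances=actDist)
    c_dir = dict(alpha_act=1.0, alpha_sg=1.0)
    c_pg = dict(mu=0.0, beta=10, nSamples=100, nBurnin=50)
    c_sweep = dict(envs=['gridworld', 'randomMDP'],
                   methods=['act_dir', 'act_pg_default', 'sg_dir'],
                   divergences=['kl'],
                   nTrajs=[1, 5, 10])
    return experiment(c_mdp, c_gw, c_rand, c_dir, c_pg, c_sweep, id=0)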
def dir_mean(mdp: MDP, S, A, alpha):
    # posterior mean of the state-conditional action distributions under a symmetric
    # Dirichlet prior with concentration alpha, given the demonstration counts
    D = mdp.trajs2dems(S, A)
    return (D + alpha) / (D + alpha).sum(axis=1, keepdims=True)
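# Worked example of the Dirichlet-smoothed mean computed by dir_mean(): the count matrix D
# and alpha are hypothetical, but the arithmetic is exactly the expression above.
def _example_dir_mean():
    import numpy as np

    # hypothetical action counts per state (rows: states, columns: actions)
    D = np.array([[3., 1., 0.],
                  [0., 0., 2.]])
    alpha = 0.5
    # first row: (3.5, 1.5, 0.5) / 5.5 ≈ (0.636, 0.273, 0.091)
    return (D + alpha) / (D + alpha).sum(axis=1, keepdims=True)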