def findConstrainedOptPi(self, activeCons=(), addKnownLockedCons=True, mdp=None):
    """
    :param activeCons: constraints that should be followed
    :param mdp: the MDP to solve; self.mdp by default
    :return: {'feasible': whether a solution exists; if not, this is the only key,
              'obj': the objective value,
              'pi': the policy found}
    """
    if mdp is None:
        mdp = self.mdp

    if addKnownLockedCons:
        activeCons = tuple(activeCons) + tuple(self.knownLockedCons)
    zeroConstraints = self.getGivenFeatCons(activeCons)

    if config.OPT_METHOD == 'gurobi':
        return lpDualGurobi(mdp, zeroConstraints=zeroConstraints, positiveConstraints=self.goalCons)
    elif config.OPT_METHOD == 'cplex':
        # not using this. only for comparison
        return lpDualCPLEX(mdp, zeroConstraints=zeroConstraints, positiveConstraints=self.goalCons)
    else:
        raise Exception('unknown method')
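# A minimal usage sketch for the dictionary returned above. The 'agent' object and
# the constraint values are illustrative assumptions, not part of the original code.
res = agent.findConstrainedOptPi(activeCons=[0, 2])
if res['feasible']:
    print 'constrained optimal value:', res['obj']
    pi = res['pi']  # the policy (occupancy) found by the LP solver
else:
    # no policy satisfies the active constraints together with the goal constraints
    print 'infeasible under the given constraints'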
def consistCond(res, idx):
    # res is consistent with reward candidate idx if res is among the query policies
    # that achieve the maximum value under R[idx]
    piValues = {}
    for piIdx in range(len(query)):
        obj, _ = lpDualGurobi(self.args['S'], self.args['A'], self.args['R'][idx], self.args['T'], self.args['s0'], query[piIdx])
        piValues[piIdx] = obj

    maxValue = max(piValues.values())
    optPiIdxs = filter(lambda piIdx: piValues[piIdx] == maxValue, range(len(query)))

    return any(res == query[piIdx] for piIdx in optPiIdxs)
def consistCond(res, idx):
    # res is consistent with reward candidate idx if it equals the best commitment in
    # the query under R[idx] (ties are broken by query order, unlike the variant above)
    maxV = float('-inf')
    optCommit = None
    for commit in query:
        # compute the operator's value when following this commitment
        value, _ = lpDualGurobi(self.args['S'], self.args['A'], self.args['R'][idx], self.args['T'], self.args['s0'], commit)
        if value > maxV:
            maxV = value
            optCommit = commit

    return optCommit == res
def findConstrainedOptPi(self, activeCons):
    mdp = copy.copy(self.mdp)
    zeroConstraints = self.constructConstraints(activeCons, mdp)

    if config.METHOD == 'gurobi':
        return lpDualGurobi(mdp, zeroConstraints=zeroConstraints)
    elif config.METHOD == 'cplex':
        return lpDualCPLEX(mdp, zeroConstraints=zeroConstraints)
    elif config.METHOD == 'mcts':
        return MCTS(**mdp)
    else:
        raise Exception('unknown method')
def learn(self):
    args = {}
    args['S'] = self.mdp.getStates()
    # assume the same actions are available in all states
    args['A'] = self.mdp.getPossibleActions(self.mdp.state)

    def transition(s, a, sp):
        # return the probability of reaching sp by taking a in s
        trans = self.mdp.getTransitionStatesAndProbs(s, a)
        trans = filter(lambda (state, prob): state == sp, trans)
        if len(trans) > 0:
            return trans[0][1]
        else:
            return 0

    args['T'] = transition
    args['r'] = self.mdp.getReward
    args['s0'] = self.mdp.state

    self.opt, self.x = lpDualGurobi(**args)
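# For reference, the occupancy-measure (dual) LP that a function like lpDualGurobi
# typically solves can be written directly with gurobipy. This is a generic sketch
# under the (S, A, T, r, s0) signature used above -- NOT the original lpDualGurobi;
# the reward signature r(s, a), the gamma default, and all names are assumptions.
from gurobipy import Model, GRB, quicksum

def lpDualSketch(S, A, T, r, s0, gamma=0.9):
    m = Model('mdpDual')
    m.setParam('OutputFlag', False)

    # occupancy variables x[s, a] >= 0
    x = dict(((s, a), m.addVar(lb=0)) for s in S for a in A)

    # flow conservation: out-flow of each state equals its initial mass plus discounted in-flow
    for s in S:
        m.addConstr(quicksum(x[s, a] for a in A)
                    == (1 if s == s0 else 0)
                    + gamma * quicksum(T(sp, ap, s) * x[sp, ap] for sp in S for ap in A))

    # maximize expected discounted reward under the occupancy measure
    m.setObjective(quicksum(r(s, a) * x[s, a] for s in S for a in A), GRB.MAXIMIZE)
    m.optimize()

    if m.status != GRB.OPTIMAL:
        return None, None
    return m.objVal, dict((sa, var.X) for sa, var in x.items())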
def findConstrainedOptPi(self, activeCons): """ :param activeCons: constraints that should be followed :return: {'feasible': if solution exists; if not exists, this is the only property, 'obj': the objective value, 'pi': the policy found} """ mdp = copy.copy(self.mdp) zeroConstraints = self.constructConstraints( tuple(activeCons) + tuple(self.knownLockedCons)) if config.METHOD == 'gurobi': return lpDualGurobi(mdp, zeroConstraints=zeroConstraints, positiveConstraints=self.goalCons, positiveConstraintsOcc=0.1) elif config.METHOD == 'cplex': # not using this. only for comparision return lpDualCPLEX(mdp, zeroConstraints=zeroConstraints, positiveConstraints=self.goalCons) else: raise Exception('unknown method')
def findDomPis(mdpH, mdpR, delta):
    """
    Implementation of algorithm 1 in report 12.5

    mdpH, mdpR: both agents' MDPs. We assume they differ only in the action space:
      the robot's action set is a superset of the human's.
    delta: the actions that the robot can take and the human cannot.
    """
    # delta is the set of state, action pairs that have different transitions under mdpH and mdpR
    S = mdpH.S
    robotA = mdpR.A
    # the human and the robot have the same transition probabilities; the robot just has more actions
    T = mdpR.T
    gamma = mdpH.gamma

    # find the occupancy of the human's optimal policy starting from every state
    occupancies = {}
    mdpLocal = copy.deepcopy(mdpH)
    for s in S:
        mdpLocal.resetInitialState(s)
        objValue, pi = lp.lpDualGurobi(mdpLocal)
        for (deltaS, deltaA) in delta:
            # the human is unable to take this action, make sure of that here
            assert (deltaS, deltaA) not in pi.keys()
            pi[deltaS, deltaA] = 0
        occupancies[s] = pi

    # find the occupancy under a uniform initial state distribution
    averageHumanOccupancy = {}
    for s0 in S:
        # passing mdpR because we need all actions
        occupancyAdd(mdpR, averageHumanOccupancy, occupancies[s0], 1.0 / len(S))

    # find the policies that differ from \pi^*_H in only one state
    localDifferentPis = {}
    for diffS in S:
        for diffA in robotA:
            pi = copy.deepcopy(averageHumanOccupancy)
            # remove the original occupancy
            occupancyAdd(mdpR, pi, occupancies[diffS], -1.0 / len(S))
            # add action (diffS, diffA)
            occupancyAdd(mdpR, pi, {(diffS, diffA): 1}, 1.0 / len(S))
            # update the occupancy of states that can be reached by taking diffA in diffS
            for sp in S:
                if T(diffS, diffA, sp) > 0:
                    occupancyAdd(mdpR, pi, occupancies[sp], 1.0 / len(S) * gamma * T(diffS, diffA, sp))

            localDifferentPis[diffS, diffA] = pi

    print 'average human'
    printPi(averageHumanOccupancy)

    domPis = [averageHumanOccupancy]
    domPiAdded = True
    domRewards = []  # the rewards whose optimal policies are dominating policies

    # repeat until domPis converges
    while domPiAdded:
        domPiAdded = False

        for (s, a) in delta:
            # change the action in state s from \pi^*_H(s) to a
            newPi = localDifferentPis[s, a]
            print s, a
            objValue, r = findUndominatedReward(mdpH, mdpR, newPi, averageHumanOccupancy, localDifferentPis, domPis)

            if objValue > 0.0001:  # resolve numerical issues
                domRewards.append(r)
                # find the corresponding optimal policy and add it to the set of dominating policies
                mdpR.r = r
                _, newDomPi = lp.lpDualGurobi(mdpR)
                domPis.append(newDomPi)

                print 'dompi added'
                printPi(newDomPi)

                domPiAdded = True

        raw_input()  # pause for inspection

    return domPis
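# findDomPis relies on an occupancyAdd helper that is not shown here. A minimal
# plausible sketch, assuming occupancies are dicts keyed by (state, action) pairs
# with missing entries treated as zero; this is a reconstruction, not the original.
def occupancyAdd(mdp, targetOcc, sourceOcc, weight):
    # add weight * sourceOcc to targetOcc, entry by entry, over mdp's state-action space
    for s in mdp.S:
        for a in mdp.A:
            if (s, a) in sourceOcc:
                targetOcc[s, a] = targetOcc.get((s, a), 0) + weight * sourceOcc[s, a]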