Example #1
    def findConstrainedOptPi(self,
                             activeCons=(),
                             addKnownLockedCons=True,
                             mdp=None):
        """
    :param activeCons:  constraints that should be followed
    :param mdp: use mdp.r by default
    :return: {'feasible': if solution exists; if not exists, this is the only property,
              'obj': the objective value,
              'pi': the policy found}
    """
        if mdp is None: mdp = self.mdp

        if addKnownLockedCons:
            activeCons = tuple(activeCons) + tuple(self.knownLockedCons)
        zeroConstraints = self.getGivenFeatCons(activeCons)

        if config.OPT_METHOD == 'gurobi':
            return lpDualGurobi(mdp,
                                zeroConstraints=zeroConstraints,
                                positiveConstraints=self.goalCons)
        elif config.OPT_METHOD == 'cplex':
            # not used by default; kept only for comparison
            return lpDualCPLEX(mdp,
                               zeroConstraints=zeroConstraints,
                               positiveConstraints=self.goalCons)
        else:
            raise Exception('unknown method')
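
Note: lpDualGurobi itself is not shown on this page, and its calling convention differs between the examples (the example above passes an MDP object with keyword constraints; later examples pass S, A, R, T, s0 positionally). Purely for orientation, the following is a minimal sketch of the kind of dual (occupancy-measure) LP such a function could solve with gurobipy. It unpacks the MDP into S, A, T, r, s0 for clarity and returns the dict format documented above; the parameter names and the semantics of zeroConstraints / positiveConstraints are assumptions, not the actual library code.

from gurobipy import GRB, Model, quicksum

def lpDualGurobiSketch(S, A, T, r, s0, gamma=0.9,
                       zeroConstraints=(), positiveConstraints=(),
                       positiveConstraintsOcc=0.0):
    # hypothetical sketch; assumes r is callable as r(s, a) and T as T(s, a, sp)
    m = Model('mdp-dual')
    m.setParam('OutputFlag', 0)

    # x[s, a]: discounted occupancy measure of taking action a in state s
    x = {(s, a): m.addVar(lb=0.0) for s in S for a in A}
    m.update()

    # flow conservation: out-flow of each state equals its initial mass plus discounted in-flow
    for sp in S:
        m.addConstr(quicksum(x[sp, a] for a in A)
                    == (1.0 if sp == s0 else 0.0)
                    + gamma * quicksum(T(s, a, sp) * x[s, a] for s in S for a in A))

    # zeroConstraints: (state, action) pairs whose occupancy must be zero (assumed format)
    for (s, a) in zeroConstraints:
        m.addConstr(x[s, a] == 0)

    # positiveConstraints: pairs whose total occupancy must reach a threshold (assumed semantics)
    if positiveConstraints:
        m.addConstr(quicksum(x[s, a] for (s, a) in positiveConstraints)
                    >= positiveConstraintsOcc)

    m.setObjective(quicksum(r(s, a) * x[s, a] for s in S for a in A), GRB.MAXIMIZE)
    m.optimize()

    if m.status != GRB.OPTIMAL:
        return {'feasible': False}
    return {'feasible': True,
            'obj': m.objVal,
            'pi': {(s, a): x[s, a].X for s in S for a in A}}
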
Example #2
 def consistCond(res, idx):
   # value of each queried policy under the reward candidate R[idx]
   piValues = {}
   for piIdx in range(len(query)):
     obj, _ = lpDualGurobi(self.args['S'], self.args['A'], self.args['R'][idx], self.args['T'], self.args['s0'], query[piIdx])
     piValues[piIdx] = obj
   # res is consistent with R[idx] iff it is among the optimal queried policies
   maxValue = max(piValues.values())
   optPiIdxs = [piIdx for piIdx in range(len(query)) if piValues[piIdx] == maxValue]
   return any(res == query[piIdx] for piIdx in optPiIdxs)
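
A hypothetical call site for the nested function above, assuming query (the list of queried policies), the observed response res, and the reward candidates self.args['R'] are in scope: keep only the reward candidates under which the response is optimal.

  # hypothetical: indices of reward candidates consistent with the observed response
  consistentRewardIdxs = [idx for idx in range(len(self.args['R'])) if consistCond(res, idx)]
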
Example #3
 def consistCond(res, idx):
   # find the best commitment under the reward candidate R[idx]
   maxV = float('-inf')
   optCommit = None
   for commit in query:
     # compute the operator's value when following this commitment
     value, _ = lpDualGurobi(self.args['S'], self.args['A'], self.args['R'][idx], self.args['T'], self.args['s0'], commit)
     if value > maxV:
       maxV = value
       optCommit = commit
   # res is consistent with R[idx] iff it is the best commitment found
   return optCommit == res
Example #4
  def findConstrainedOptPi(self, activeCons):
    # solve for the optimal policy subject to the constraints in activeCons,
    # using the solver selected by config.METHOD
    mdp = copy.copy(self.mdp)

    zeroConstraints = self.constructConstraints(activeCons, mdp)

    if config.METHOD == 'gurobi':
      return lpDualGurobi(mdp, zeroConstraints=zeroConstraints)
    elif config.METHOD == 'cplex':
      return lpDualCPLEX(mdp, zeroConstraints=zeroConstraints)
    elif config.METHOD == 'mcts':
      return MCTS(**mdp)
    else:
      raise Exception('unknown method')
Example #5
 def learn(self):
   args = {}
   args['S'] = self.mdp.getStates()
   # assume the same set of actions is available in every state
   args['A'] = self.mdp.getPossibleActions(self.mdp.state)

   def transition(s, a, sp):
     # probability of reaching sp by taking a in s; 0 if sp is unreachable
     trans = self.mdp.getTransitionStatesAndProbs(s, a)
     probs = [prob for (state, prob) in trans if state == sp]
     return probs[0] if probs else 0

   args['T'] = transition
   args['r'] = self.mdp.getReward
   args['s0'] = self.mdp.state
   self.opt, self.x = lpDualGurobi(**args)
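
If lpDualGurobi queries T(s, a, sp) for many state triples, the nested transition function above calls getTransitionStatesAndProbs on every query. A hypothetical drop-in alternative inside learn, using only the MDP methods already shown above (and the same assumption that all states share the action set of self.mdp.state), precomputes the probabilities once:

   # hypothetical replacement for the nested transition function above
   transProbs = {}
   for s in args['S']:
     for a in args['A']:
       for (sp, prob) in self.mdp.getTransitionStatesAndProbs(s, a):
         transProbs[s, a, sp] = prob
   args['T'] = lambda s, a, sp: transProbs.get((s, a, sp), 0)
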
Example #6
    def findConstrainedOptPi(self, activeCons):
        """
    :param activeCons:  constraints that should be followed
    :return: {'feasible': if solution exists; if not exists, this is the only property,
              'obj': the objective value,
              'pi': the policy found}
    """
        mdp = copy.copy(self.mdp)

        zeroConstraints = self.constructConstraints(
            tuple(activeCons) + tuple(self.knownLockedCons))

        if config.METHOD == 'gurobi':
            return lpDualGurobi(mdp,
                                zeroConstraints=zeroConstraints,
                                positiveConstraints=self.goalCons,
                                positiveConstraintsOcc=0.1)
        elif config.METHOD == 'cplex':
            # not used by default; kept only for comparison
            return lpDualCPLEX(mdp,
                               zeroConstraints=zeroConstraints,
                               positiveConstraints=self.goalCons)
        else:
            raise Exception('unknown method')
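
A hypothetical call site, assuming an agent object whose class defines this method and already has knownLockedCons and goalCons populated, and that someConstraint is represented the same way as the entries of knownLockedCons:

    res = agent.findConstrainedOptPi(activeCons=[someConstraint])
    if res['feasible']:
        print(res['obj'])
        constrainedPi = res['pi']
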
Example #7
def findDomPis(mdpH, mdpR, delta):
    """
  Implementation of algorithm 1 in report 12.5
  
  mdpH, mdpR: both agents' mdps. now we assume that they are only different in the action space:
  the robot's action set is a superset of the human's.

  delta: the actions that the robot can take and the human cannot.
  """
    # compute the set of state, action pairs that have different transitions under mdpH and mdpR
    S = mdpH.S
    robotA = mdpR.A
    T = mdpR.T  # note that the human and the robot have the same transition probabilities. The robot just has more actions
    gamma = mdpH.gamma

    # find the occupancy of the human's optimal policy from each possible initial state
    occupancies = {}

    mdpLocal = copy.deepcopy(mdpH)
    for s in S:
        mdpLocal.resetInitialState(s)
        objValue, pi = lp.lpDualGurobi(mdpLocal)

        for (deltaS, deltaA) in delta:
            # the human cannot take this action; it should not appear in the human's occupancy
            assert (deltaS, deltaA) not in pi
            pi[deltaS, deltaA] = 0
        occupancies[s] = pi
    # find the occupancy with uniform initial state distribution
    averageHumanOccupancy = {}
    for s0 in S:
        # passing mdpR because we need all actions
        occupancyAdd(mdpR, averageHumanOccupancy, occupancies[s0],
                     1.0 / len(S))

    # find the policies that are different from $\pi^*_\H$ only in one state
    localDifferentPis = {}
    for diffS in S:
        for diffA in robotA:
            pi = copy.deepcopy(averageHumanOccupancy)
            # remove the original occupancy
            occupancyAdd(mdpR, pi, occupancies[diffS], -1.0 / len(S))
            # add action (diffS, diffA)
            occupancyAdd(mdpR, pi, {(diffS, diffA): 1}, 1.0 / len(S))

            # update the occupancy of states that can be reached by taking diffA in diffS
            for sp in S:
                if T(diffS, diffA, sp) > 0:
                    occupancyAdd(mdpR, pi, occupancies[sp],
                                 1.0 / len(S) * gamma * T(diffS, diffA, sp))

            localDifferentPis[diffS, diffA] = pi

    print('average human')
    printPi(averageHumanOccupancy)
    domPis = [averageHumanOccupancy]
    domPiAdded = True

    domRewards = []  # rewards whose optimal policies are dominating policies
    # repeat until domPis converges

    while domPiAdded:
        domPiAdded = False
        for (s, a) in delta:
            # change the action in state s from \pi^*_\H(s) to a
            newPi = localDifferentPis[s, a]

            print(s, a)
            objValue, r = findUndominatedReward(mdpH, mdpR, newPi,
                                                averageHumanOccupancy,
                                                localDifferentPis, domPis)

            if objValue > 0.0001:  # resolve numerical issues
                domRewards.append(r)

                # find the corresponding optimal policy and add to the set of dominating policies
                mdpR.r = r
                _, newDompi = lp.lpDualGurobi(mdpR)

                domPis.append(newDompi)
                print('dompi added')
                printPi(newDompi)

                domPiAdded = True

            input()  # pause for inspection between iterations

    return domPis
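
occupancyAdd is not defined on this page. Below is a minimal sketch consistent with how it is called above (accumulating a weighted occupancy into another over all of the given MDP's state-action pairs); this is an assumption about the helper, not its actual implementation.

def occupancyAdd(mdp, occ, occToAdd, weight):
    # hypothetical helper: occ[s, a] += weight * occToAdd[s, a] for every state-action pair of mdp
    for s in mdp.S:
        for a in mdp.A:
            occ[s, a] = occ.get((s, a), 0) + weight * occToAdd.get((s, a), 0)
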