예제 #1
0
파일: lp.py 프로젝트: shunzh/RLCodeBase
def decomposePiLP(S, A, T, s0, terminal, rawX, x, gamma=1):
  """
  DEPRECATED.
  This tries to decouple a policy into the optimal policy (following no constraints) and another policy \pi'.
  \pi' may be a dominating policy.
  Described in Eq. 2 on Aug.29, 2017.
  """
  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  # useful constants
  Sr = range(len(S))
  Ar = range(len(A))
 
  y = m.new((len(S), len(A)), lb=0, name='y')
  sigma = m.new(lb=0, ub=1, name='sigma')
  
  for s in Sr:
    for a in Ar:
      # note that x and rawX use S x A as domains
      m.constrain(sigma * rawX[S[s], A[a]] + y[s, a] == x[S[s], A[a]])

  # make sure y is a valid occupancy
  for sp in Sr:
    # x (x(s) - \gamma * T) = \sigma
    # and make sure there is no flow back from the terminal states
    if not terminal(S[sp]):
      m.constrain(sum(y[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[s]))) for s in Sr for a in Ar) == (1 - sigma) * (S[sp] == s0))
    else:
      m.constrain(sum(y[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[sp]))) for s in Sr for a in Ar) == (1 - sigma) * (S[sp] == s0))
 
  obj = m.maximize(sigma)
  
  # return sigma and the value of y
  return obj, {(S[s], A[a]): m[y][s, a] for s in Sr for a in Ar}
예제 #2
0
def decomposePiLP(S, A, T, s0, terminal, rawX, x, gamma=1):
  """
  This tries to decouple a policy into the optimal policy (following no constraints) and another policy \pi'.
  \pi' may be a dominating policy.
  Described in Eq. 2 on Aug.29, 2017.
  """
  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  # useful constants
  Sr = range(len(S))
  Ar = range(len(A))
 
  y = m.new((len(S), len(A)), lb=0, name='y')
  sigma = m.new(lb=0, ub=1, name='sigma')
  
  for s in Sr:
    for a in Ar:
      # note that x and rawX use S x A as domains
      m.constrain(sigma * rawX[S[s], A[a]] + y[s, a] == x[S[s], A[a]])

  # make sure y is a valid occupancy
  for sp in Sr:
    # x (x(s) - \gamma * T) = \sigma
    # and make sure there is no flow back from the terminal states
    if not terminal(S[sp]):
      m.constrain(sum(y[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[s]))) for s in Sr for a in Ar) == (1 - sigma) * (S[sp] == s0))
    else:
      m.constrain(sum(y[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[sp]))) for s in Sr for a in Ar) == (1 - sigma) * (S[sp] == s0))
 
  obj = m.maximize(sigma)
  
  # return sigma and the value of y
  return obj, {(S[s], A[a]): m[y][s, a] for s in Sr for a in Ar}
예제 #3
0
파일: lp.py 프로젝트: shunzh/RLCodeBase
def lp(S, A, r, T, s0):
  """
  Solve the LP problem to find out the optimal occupancy
  
  Args:
    S: state set
    A: action set
    r: reward
    T: transition function
    s0: init state
  """
  m = CPlexModel()
  if not config.VERBOSE or config.DEBUG: m.setVerbosity(0)

  # useful constants
  Sr = range(len(S))
 
  v = m.new(len(S), name='v')

  for s in Sr:
    for a in A:
      m.constrain(v[s] >= r(S[s], a) + sum(v[sp] * T(S[s], a, S[sp]) for sp in Sr))
  
  # obj
  obj = m.minimize(v[s0])
  ret = util.Counter()
  for s in Sr:
    ret[S[s]] = m[v][s]
  return ret
예제 #4
0
def lp(S, A, r, T, s0):
  """
  Solve the LP problem to find out the optimal occupancy

  Args:
    S: state set
    A: action set
    r: reward
    T: transition function
    s0: init state
  """
  m = CPlexModel()
  if not config.VERBOSE or config.DEBUG: m.setVerbosity(0)

  # useful constants
  Sr = range(len(S))

  v = m.new(len(S), name='v')

  for s in Sr:
    for a in A:
      m.constrain(v[s] >= r(S[s], a) + sum(v[sp] * T(S[s], a, S[sp]) for sp in Sr))

  # obj
  obj = m.minimize(v[s0])
  ret = util.Counter()
  for s in Sr:
    ret[S[s]] = m[v][s]
  return ret
예제 #5
0
def lpDualCPLEX(mdp,
                zeroConstraints=(),
                positiveConstraints=(),
                positiveConstraintsOcc=1):
    """
  DEPRECATED since we moved to gurobi. but leave the function here for sanity check
  Solve the dual problem of lp, maybe with some constraints
  Same arguments
  Note that this is a lower level function that does not consider feature extraction.
  r should be a reward function, not a reward parameter.
  """
    S = mdp.S
    A = mdp.A
    T = mdp.T
    r = mdp.r
    gamma = mdp.gamma
    alpha = mdp.alpha

    m = CPlexModel()
    if not config.VERBOSE: m.setVerbosity(0)

    # useful constants
    Sr = range(len(S))
    Ar = range(len(A))

    x = m.new((len(S), len(A)), lb=0, name='x')

    # make sure x is a valid occupancy
    for sp in Sr:
        # x (x(s) - \gamma * T) = \sigma
        m.constrain(
            sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]))
                for s in Sr for a in Ar) == alpha(S[sp]))

    # == constraints
    if len(zeroConstraints) > 0:
        m.constrain(
            sum(x[S.index(s), A.index(a)] for s, a in zeroConstraints) == 0)

    # >= constraints
    if len(positiveConstraints) > 0:
        m.constrain(
            sum(x[S.index(s), A.index(a)]
                for s, a in positiveConstraints) >= positiveConstraintsOcc)

    # obj
    try:
        obj = m.maximize(sum([x[s, a] * r(S[s], A[a]) for s in Sr
                              for a in Ar]))
    except CPlexException as err:
        print 'Exception', err
        # we return obj value as None and occ measure as {}. this should be handled correctly
        return {'feasible': False}

    return {
        'feasible': True,
        'obj': obj,
        'pi': {(S[s], A[a]): m[x][s, a]
               for s in Sr for a in Ar}
    }
예제 #6
0
파일: lp.py 프로젝트: shunzh/RLCodeBase
def domPiMilp(S, A, r, T, s0, terminal, domPis, consIdx, gamma=1):
  """
  Finding dominating policies by representing constraints as possible negative rewards.
  Described in the report on aug.19, 2017.
  """
  rmax = 10000
  M = 0.001
  consLen = len(consIdx)

  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  # state range
  Sr = range(len(S))
  # action range
  Ar = range(len(A))
  
  # decision variables
  x = m.new((len(S), len(A)), lb=0, name='x')
  z = m.new(consLen, vtype=bool, name='z')
  #z = [0, 1, 0] # test for office nav domain
  t = m.new(name='t')
  
  # flow conservation
  for sp in Sr:
    # x (x(s) - \gamma * T) = \sigma
    # and make sure there is no flow back from the terminal states
    if not terminal(S[sp]):
      m.constrain(sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[s]))) for s in Sr for a in Ar) == (S[sp] == s0))
      #print S[sp], [(S[s], A[a]) for s in Sr for a in Ar if T(S[s], A[a], S[sp]) > 0]
    else:
      m.constrain(sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[sp]))) for s in Sr for a in Ar) == (S[sp] == s0))

  # t is the lower bound of the difference between x and y
  # note: i don't think expressions in constraints can call other functions
  for y in domPis:
    # note that y is indexed by elements in S x A, not numbered indices
    m.constrain(sum(x[s, a] * r(S[s], A[a]) for s in Sr for a in Ar) -\
                sum(y[S[s], A[a]] *
                    (r(S[s], A[a]) + sum(- rmax * (S[s][consIdx[i]] != s0[consIdx[i]]) * z[i] for i in range(consLen)))\
                    for s in Sr for a in Ar)\
                >= t)
   
  for s in Sr:
    for i in range(consLen):
      if S[s][consIdx[i]] != s0[consIdx[i]]:
        for a in Ar:
          m.constrain(z[i] + M * x[s, a] <= 1)

  # obj
  obj = m.maximize(t)
  
  print m[z]
  
  return obj, {(S[s], A[a]): m[x][s, a] for s in Sr for a in Ar}
예제 #7
0
파일: lp.py 프로젝트: shunzh/RLCodeBase
def milp(S, A, R, T, s0, psi, maxV):
  """
  Solve the MILP problem in greedy construction of policy query
  
  Args:
    S: state set
    A: action set
    R: reward candidate set
    T: transition function
    s0: init state
    psi: prior belief on rewards
    maxV: maxV[i] = max_{\pi \in q} V_{r_i}^\pi
  """
  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  # useful constants
  rLen = len(R)
  M = 10000 # a large number
  Sr = range(len(S))
  Ar = range(len(A))
  
  # decision variables
  # FIXME i removed upper bound of x. it shoundn't have such bound without transient-state assumption, right?
  x = m.new((len(S), len(A)), lb=0, name='x')
  z = m.new(rLen, vtype=bool, name='z')
  y = m.new(rLen, name='y')

  # constraints on y
  m.constrain([y[i] <= sum([x[s, a] * R[i](S[s], A[a]) for s in Sr for a in Ar]) - maxV[i] + (1 - z[i]) * M for i in xrange(rLen)])
  m.constrain([y[i] <= z[i] * M for i in xrange(rLen)])
  
  # constraints on x (valid occupancy)
  for sp in Sr:
    if S[sp] == s0:
      m.constrain(sum([x[sp, ap] for ap in Ar]) == 1)
    else:
      m.constrain(sum([x[sp, ap] for ap in Ar]) == sum([x[s, a] * T(S[s], A[a], S[sp]) for s in Sr for a in Ar]))
  
  # obj
  obj = m.maximize(sum([psi[i] * y[i] for i in xrange(rLen)]))

  if config.VERBOSE:
    print 'obj', obj
    print 'x', m[x]
    print 'y', m[y]
    print 'z', m[z]
  
  # build occupancy as S x A -> x[.,.]
  # z[i] == 1 then this policy is better than maxV on the i-th reward candidate
  res = util.Counter()
  for s in Sr:
    for a in Ar:
      res[S[s], A[a]] = m[x][s, a] 
  return res
예제 #8
0
def domPiMilp(S, A, r, T, s0, terminal, domPis, consIdx, gamma=1):
  """
  Finding dominating policies by representing constraints as possible negative rewards.
  Described in the report on aug.19, 2017.
  """
  rmax = 10000
  M = 0.001
  consLen = len(consIdx)

  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  # state range
  Sr = range(len(S))
  # action range
  Ar = range(len(A))
  
  # decision variables
  x = m.new((len(S), len(A)), lb=0, name='x')
  z = m.new(consLen, vtype=bool, name='z')
  #z = [0, 1, 0] # test for office nav domain
  t = m.new(name='t')
  
  # flow conservation
  for sp in Sr:
    # x (x(s) - \gamma * T) = \sigma
    # and make sure there is no flow back from the terminal states
    if not terminal(S[sp]):
      m.constrain(sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[s]))) for s in Sr for a in Ar) == (S[sp] == s0))
      #print S[sp], [(S[s], A[a]) for s in Sr for a in Ar if T(S[s], A[a], S[sp]) > 0]
    else:
      m.constrain(sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[sp]))) for s in Sr for a in Ar) == (S[sp] == s0))

  # t is the lower bound of the difference between x and y
  # note: i don't think expressions in constraints can call other functions
  for y in domPis:
    # note that y is indexed by elements in S x A, not numbered indices
    m.constrain(sum(x[s, a] * r(S[s], A[a]) for s in Sr for a in Ar) -\
                sum(y[S[s], A[a]] *
                    (r(S[s], A[a]) + sum(- rmax * (S[s][consIdx[i]] != s0[consIdx[i]]) * z[i] for i in range(consLen)))\
                    for s in Sr for a in Ar)\
                >= t)
   
  for s in Sr:
    for i in range(consLen):
      if S[s][consIdx[i]] != s0[consIdx[i]]:
        for a in Ar:
          m.constrain(z[i] + M * x[s, a] <= 1)

  # obj
  obj = m.maximize(t)
  
  print m[z]
  
  return obj, {(S[s], A[a]): m[x][s, a] for s in Sr for a in Ar}
예제 #9
0
def milp(S, A, R, T, s0, psi, maxV):
  """
  Solve the MILP problem in greedy construction of policy query
  
  Args:
    S: state set
    A: action set
    R: reward candidate set
    T: transition function
    s0: init state
    psi: prior belief on rewards
    maxV: maxV[i] = max_{\pi \in q} V_{r_i}^\pi
  """
  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  # useful constants
  rLen = len(R)
  M = 10000 # a large number
  Sr = range(len(S))
  Ar = range(len(A))
  
  # decision variables
  # FIXME i removed upper bound of x. it shoundn't have such bound without transient-state assumption, right?
  x = m.new((len(S), len(A)), lb=0, name='x')
  z = m.new(rLen, vtype=bool, name='z')
  y = m.new(rLen, name='y')

  # constraints on y
  m.constrain([y[i] <= sum([x[s, a] * R[i](S[s], A[a]) for s in Sr for a in Ar]) - maxV[i] + (1 - z[i]) * M for i in xrange(rLen)])
  m.constrain([y[i] <= z[i] * M for i in xrange(rLen)])
  
  # constraints on x (valid occupancy)
  for sp in Sr:
    if S[sp] == s0:
      m.constrain(sum([x[sp, ap] for ap in Ar]) == 1)
    else:
      m.constrain(sum([x[sp, ap] for ap in Ar]) == sum([x[s, a] * T(S[s], A[a], S[sp]) for s in Sr for a in Ar]))
  
  # obj
  obj = m.maximize(sum([psi[i] * y[i] for i in xrange(rLen)]))

  if config.VERBOSE:
    print 'obj', obj
    print 'x', m[x]
    print 'y', m[y]
    print 'z', m[z]
  
  # build occupancy as S x A -> x[.,.]
  # z[i] == 1 then this policy is better than maxV on the i-th reward candidate
  res = util.Counter()
  for s in Sr:
    for a in Ar:
      res[S[s], A[a]] = m[x][s, a] 
  return res
예제 #10
0
def findUndominatedReward(mdpH, mdpR, newPi, humanPi, localDifferentPis,
                          domPis):
    """
  Implementation of the linear programming problem (Eq.2) in report 12.5
  Returns the objective value and a reward function (which is only useful when the obj value is > 0)
  
  newPi is \hat{\pi} in the linear programming problem in the report.
  The robot tries to see if there exists a reward function where newPi is better than the best policy in domPis.
  """
    m = CPlexModel()
    if not config.VERBOSE: m.setVerbosity(0)

    S = mdpH.S
    robotA = mdpR.A
    humanA = mdpH.A

    # index of states and actions
    Sr = range(len(S))
    robotAr = range(len(robotA))
    humanAr = range(len(humanA))

    r = m.new(len(S), lb=0, ub=1, name='r')
    z = m.new(
        name='z'
    )  # when the optimal value is attained, z = \max_{domPi \in domPis} V^{domPi}_r

    for domPi in domPis:
        m.constrain(z >= sum(
            [domPi[S[s], robotA[a]] * r[s] for s in Sr for a in robotAr]))

    # make sure r is consistent with humanPi
    for s in S:
        for a in humanA:
            # humanPi is better than a locally different policy which takes action a in state a
            m.constrain(sum(sum((humanPi[S[sp], humanA[ap]] - localDifferentPis[s, a][S[sp], humanA[ap]]) for ap in humanAr)\
                            * r[sp] for sp in Sr) >= 0)

    # maxi_r { V^{newPi}_r - \max_{domPi \in domPis} V^{domPi}_r }
    cplexObj = m.maximize(
        sum(newPi[S[s], robotA[a]] * r[s] for s in Sr for a in robotAr) - z)

    obj = sum([newPi[S[s], robotA[a]] * m[r][s] for s in Sr
               for a in robotAr]) - m[z]

    # the reward function has the same values for same states, but need to convert back to the S x A space
    rFunc = lambda s, a: m[r][Sr.index(s)]

    print 'cplexobj', cplexObj
    print 'obj', obj
    print 'newPi'
    printPi(newPi)
    print 'z', m[z], 'r', m[r]

    return obj, rFunc
예제 #11
0
def rewardUncertainMILP(S, A, R, T, s0, terminal, k, optV, gamma=1):
    """
  The algorithm adapted from
  Viappiani, Paolo and Boutilier, CraigOptimal. set recommendations based on regret

  This algorithm would find the minimax-regret policy query in our problem.
  Not sure how to use this algorithm.
  """
    m = CPlexModel()
    if not config.VERBOSE: m.setVerbosity(0)

    M = 100000

    # state range
    Sr = range(len(S))
    # action range
    Ar = range(len(A))

    mr = m.new(name='mr')
    # decision variables
    x = m.new((k, len(S), len(A)), lb=0, name='x')
    v = m.new((k, len(R)), name='v')
    I = m.new((k, len(R)), vtype=bool, name='I')

    for r in range(len(R)):
        m.constrain(mr >= sum(v[i, r]) for i in range(k))

    for r in range(len(R)):
        for i in range(k):
            m.constrain(v[i, r] >= optV[r] - sum(x[i, s, a] * R[r](S[s], A[a])
                                                 for s in Sr for a in Ar) +
                        (I[i, r] - 1) * M)

    # make sure x is a valid occupancy
    for i in range(k):
        for sp in Sr:
            m.constrain(
                sum(x[i, s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]))
                    for s in Sr for a in Ar) == (S[sp] == s0))

    for r in range(len(R)):
        m.constrain(sum(I[i, r] for i in range(k)) == 1)

    for r in range(len(R)):
        for i in range(k):
            m.constrain(v[i, r] >= 0)

    obj = m.minimize(mr)

    return obj, m[I]
예제 #12
0
def findUndominatedReward(mdpH, mdpR, newPi, humanPi, localDifferentPis, domPis):
  """
  Implementation of the linear programming problem (Eq.2) in report 12.5
  Returns the objective value and a reward function (which is only useful when the obj value is > 0)
  
  newPi is \hat{\pi} in the linear programming problem in the report.
  The robot tries to see if there exists a reward function where newPi is better than the best policy in domPis.
  """
  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)
  
  S = mdpH.S
  robotA = mdpR.A
  humanA = mdpH.A

  # index of states and actions
  Sr = range(len(S))
  robotAr = range(len(robotA))
  humanAr = range(len(humanA))
  
  r = m.new(len(S), lb=0, ub=1, name='r')
  z = m.new(name='z') # when the optimal value is attained, z = \max_{domPi \in domPis} V^{domPi}_r

  for domPi in domPis:
    m.constrain(z >= sum([domPi[S[s], robotA[a]] * r[s] for s in Sr for a in robotAr]))
  
  # make sure r is consistent with humanPi
  for s in S:
    for a in humanA:
      # humanPi is better than a locally different policy which takes action a in state a
      m.constrain(sum(sum((humanPi[S[sp], humanA[ap]] - localDifferentPis[s, a][S[sp], humanA[ap]]) for ap in humanAr)\
                      * r[sp] for sp in Sr) >= 0)
    
  # maxi_r { V^{newPi}_r - \max_{domPi \in domPis} V^{domPi}_r }
  cplexObj = m.maximize(sum(newPi[S[s], robotA[a]] * r[s] for s in Sr for a in robotAr) - z)
  
  obj = sum([newPi[S[s], robotA[a]] * m[r][s] for s in Sr for a in robotAr]) - m[z]

  # the reward function has the same values for same states, but need to convert back to the S x A space
  rFunc = lambda s, a: m[r][Sr.index(s)]

  print 'cplexobj', cplexObj
  print 'obj', obj
  print 'newPi'
  printPi(newPi)
  print 'z', m[z], 'r', m[r]

  return obj, rFunc
예제 #13
0
파일: lp.py 프로젝트: shunzh/RLCodeBase
def lpDualCPLEX(mdp, zeroConstraints=[], positiveConstraints=[], positiveConstraintsOcc=1):
  """
  Solve the dual problem of lp, maybe with some constraints
  Same arguments

  Note that this is a lower level function that does not consider feature extraction.
  r should be a reward function, not a reward parameter.
  """

  S = mdp.S
  A = mdp.A
  T = mdp.T
  r = mdp.r
  gamma = mdp.gamma
  alpha = mdp.alpha

  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  # useful constants
  Sr = range(len(S))
  Ar = range(len(A))

  x = m.new((len(S), len(A)), lb=0, name='x')

  # make sure x is a valid occupancy
  for sp in Sr:
    # x (x(s) - \gamma * T) = \sigma
    m.constrain(sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp])) for s in Sr for a in Ar) == alpha(S[sp]))

  # == constraints
  if len(zeroConstraints) > 0:
    m.constrain(sum(x[S.index(s), A.index(a)] for s, a in zeroConstraints) == 0)

  # >= constraints
  if len(positiveConstraints) > 0:
    m.constrain(sum(x[S.index(s), A.index(a)] for s, a in positiveConstraints) >= positiveConstraintsOcc)

  # obj
  try:
    obj = m.maximize(sum([x[s, a] * r(S[s], A[a]) for s in Sr for a in Ar]))
  except CPlexException as err:
    print 'Exception', err
    # we return obj value as None and occ measure as {}. this should be handled correctly
    return {'feasible': False}

  return {'feasible': True, 'obj': obj, 'pi': {(S[s], A[a]): m[x][s, a] for s in Sr for a in Ar}}
예제 #14
0
파일: lp.py 프로젝트: shunzh/RLCodeBase
def rewardUncertainMILP(S, A, R, T, s0, terminal, k, optV, gamma=1):
  """
  The algorithm adapted from
  Viappiani, Paolo and Boutilier, CraigOptimal. set recommendations based on regret

  This algorithm would find the minimax-regret policy query in our problem.
  Not sure how to use this algorithm.
  """
  m = CPlexModel()
  if not config.VERBOSE: m.setVerbosity(0)

  M = 100000

  # state range
  Sr = range(len(S))
  # action range
  Ar = range(len(A))
  
  mr = m.new(name='mr')
  # decision variables
  x = m.new((k, len(S), len(A)), lb=0, name='x')
  v = m.new((k, len(R)), name='v')
  I = m.new((k, len(R)), vtype=bool, name='I')
  
  for r in range(len(R)):
    m.constrain(mr >= sum(v[i, r]) for i in range(k))
  
  for r in range(len(R)):
    for i in range(k):
      m.constrain(v[i, r] >= optV[r] - sum(x[i, s, a] * R[r](S[s], A[a]) for s in Sr for a in Ar) + (I[i, r] - 1) * M)

  # make sure x is a valid occupancy
  for i in range(k):
    for sp in Sr:
      m.constrain(sum(x[i, s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp])) for s in Sr for a in Ar) == (S[sp] == s0))
  
  for r in range(len(R)):
    m.constrain(sum(I[i, r] for i in range(k)) == 1)
  
  for r in range(len(R)):
    for i in range(k):
      m.constrain(v[i, r] >= 0)
  
  obj = m.minimize(mr)
  
  return obj, m[I]