def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(self.rewardSet)
    k = config.NUMBER_OF_RESPONSES

    # score each sampled (state, reward-candidate subset) pair by the total
    # pairwise distance between the trajectories generated from those candidates
    hValues = util.Counter()
    for sIdx in range(config.SAMPLE_TIMES):
        s = self.cmp.sampleState()
        indices = numpy.random.choice(range(rewardCandNum), k, replace=False)
        trajs = [self.sampleTrajFromRewardCandidate(idx, s) for idx in indices]
        if any(len(u) < config.TRAJECTORY_LENGTH for u in trajs):
            continue
        for i in xrange(k):
            for j in xrange(k):
                hValues[(s, tuple(indices))] += self.cmp.getTrajectoryDistance(trajs[i], trajs[j])

    # query from the highest-scoring state and reward-candidate subset
    maxH = max(hValues.values())
    maxStatesIndices = filter(lambda _: hValues[_] == maxH, hValues.keys())
    maxState, maxIndices = random.choice(maxStatesIndices)
    trajs = [self.sampleTrajFromRewardCandidate(idx, maxState) for idx in maxIndices]
    return trajs, None
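# A minimal, self-contained sketch (not repo code) of the diversity heuristic used
# above: a candidate query is scored by the total pairwise distance between the
# trajectories it contains, so the most mutually distinguishable set of
# demonstrations is preferred. 'distance' is a hypothetical stand-in for
# self.cmp.getTrajectoryDistance.
def total_pairwise_distance(trajs, distance):
    # mirrors the double loop over i, j above (self-distances are assumed to be 0)
    return sum(distance(trajs[i], trajs[j])
               for i in xrange(len(trajs)) for j in xrange(len(trajs)))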
def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(self.rewardSet)
    k = config.NUMBER_OF_RESPONSES

    hValues = util.Counter()
    for sIdx in range(config.SAMPLE_TIMES):
        s = self.cmp.sampleState()
        indices = numpy.random.choice(range(rewardCandNum), k, replace=False)
        trajs = [self.sampleTrajFromRewardCandidate(idx, s) for idx in indices]
        if any(len(u) < config.TRAJECTORY_LENGTH for u in trajs):
            continue
        # compute the difference between the new psi and the old psi
        psiProbs = self.getPossiblePhiAndProbs(trajs)
        for psi, prob in psiProbs:
            # note that we need to keep track of which state the query is generated from
            # and which reward candidates the policies are optimizing
            hValues[(s, tuple(indices))] += prob * sum(abs(p1 - p2) for p1, p2 in zip(psi, self.phi))

    maxH = max(hValues.values())
    maxStatesIndices = filter(lambda _: hValues[_] == maxH, hValues.keys())
    maxState, maxIndices = random.choice(maxStatesIndices)
    trajs = [self.sampleTrajFromRewardCandidate(idx, maxState) for idx in maxIndices]
    return trajs, None
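# A minimal sketch (illustrative names, not repo code) of the heuristic value used
# above: a query is scored by the expected L1 distance between the updated belief
# psi and the prior phi, weighted by the probability of each possible response.
def expected_belief_shift(psi_probs, phi):
    # psi_probs: list of (psi, prob) pairs, one per possible response
    return sum(prob * sum(abs(p1 - p2) for p1, p2 in zip(psi, phi))
               for psi, prob in psi_probs)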
def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(self.rewardSet)

    if self.queryType == QueryType.DEMONSTRATION:
        k = config.NUMBER_OF_RESPONSES
    else:
        raise Exception("query type not implemented")

    # now q is a set of TRAJECTORIES
    q = []
    for i in range(k):
        if i == 0:
            args['maxV'] = [0] * rewardCandNum
        else:
            # for each reward candidate, the value of the best element added to q so far
            args['maxV'] = []
            for rewardId in xrange(rewardCandNum):
                args['maxV'].append(max([self.computeV(pi, args['S'], args['A'], args['R'][rewardId], self.cmp.horizon) for pi in q]))

        x = lp.milp(**args)

        if self.heuristic:
            # TODO what to do with this x for demonstration purposes
            pass

        q.append(self.sampleTrajectory(x))

    objValue = self.getQValue(self.cmp.state, None, q)

    if self.queryType == QueryType.DEMONSTRATION:
        return q, None, objValue
    else:
        raise Exception("query type not implemented")
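# A minimal sketch (illustrative names, not repo code) of the 'maxV' bookkeeping
# above: before adding the next element, record for every reward candidate the best
# value any element already in q achieves, so the next optimization only gets
# credit for improving on that baseline. 'values' stands in for self.computeV.
def current_baselines(q, values, rewardCandNum):
    # values(pi, r) = value of policy/trajectory pi under reward candidate r
    if not q:
        return [0] * rewardCandNum
    return [max(values(pi, r) for pi in q) for r in range(rewardCandNum)]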
def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(args['R'])
    horizon = self.cmp.horizon
    k = config.NUMBER_OF_RESPONSES

    maxV = -numpy.inf
    maxQ = None
    for iterIdx in range(config.SAMPLE_TIMES):
        # sample k policies from random linear reward weights
        q = []
        for i in xrange(k):
            theta = [-0.5 + random.random() for _ in xrange(self.featLength)]
            q.append(self.thetaToOccupancy(theta))

        # for each reward candidate, the value of the best policy in the query
        maxVs = []
        for rewardId in xrange(rewardCandNum):
            maxVs.append(max([self.computeV(pi, args['S'], args['A'], args['R'][rewardId], horizon) for pi in q]))

        objValue = sum(maxVs[idx] * self.phi[idx] for idx in range(rewardCandNum))
        if objValue > maxV:
            maxV = objValue
            maxQ = q

    return maxQ, None
def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    k = config.NUMBER_OF_RESPONSES

    # find an arbitrary state to generate trajectory queries
    # make sure that each trajectory in the query has length TRAJECTORY_LENGTH
    while True:
        s = self.cmp.sampleState()
        q = [tuple(self.sampleTrajectory(None, s, hori=config.TRAJECTORY_LENGTH, to='trajectory')) for _ in xrange(k)]
        if any(len(u) < config.TRAJECTORY_LENGTH for u in q):
            continue
        else:
            break

    return q, None
def learn(self):
    rewardSet = [self.mdp.getReward]
    psi = [1]  # the agent is certain about the reward function
    args = easyDomains.convert(self.mdp, rewardSet, psi)
    args['maxV'] = [-numpy.inf]

    self.agent = PolicyGradientQueryAgent(self.mdp, rewardSet, psi, QueryType.POLICY, self.feat, self.featLength, self.discount)
    self.optPi = self.agent.findNextPolicy(**args)
    self.x = lambda s, a: self.optPi(s, a)
    return self.optPi
def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(args['R'])
    k = config.NUMBER_OF_RESPONSES
    assert k == 2  # not going to work for k > 2

    maxV = -numpy.inf
    maxQ = None
    for iterIdx in range(config.SAMPLE_TIMES):
        # randomly split the reward candidates into two complementary psis
        selector = [random.random() > .5 for _ in xrange(rewardCandNum)]
        psi0 = [self.phi[_] if selector[_] else 0 for _ in xrange(rewardCandNum)]
        psi1 = [self.phi[_] if not selector[_] else 0 for _ in xrange(rewardCandNum)]

        agent0 = self.getFiniteVIAgent(psi0, self.cmp.horizon - self.cmp.getResponseTime(), self.cmp.terminalReward, posterior=True)
        agent1 = self.getFiniteVIAgent(psi1, self.cmp.horizon - self.cmp.getResponseTime(), self.cmp.terminalReward, posterior=True)

        v = agent0.getValue(self.cmp.state) + agent1.getValue(self.cmp.state)
        if v > maxV:
            maxV = v
            maxQ = [agent0.x, agent1.x]

    return maxQ, None
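# A minimal, self-contained sketch (not repo code) of the random binary split used
# above: each reward candidate is assigned to one of the two responses, and the
# prior weight phi[i] goes to whichever psi vector owns candidate i. The names
# phi and split_posterior are illustrative only.
import random

def split_posterior(phi):
    selector = [random.random() > .5 for _ in xrange(len(phi))]
    psi0 = [phi[i] if selector[i] else 0 for i in xrange(len(phi))]
    psi1 = [phi[i] if not selector[i] else 0 for i in xrange(len(phi))]
    return psi0, psi1

# e.g. split_posterior([.4, .3, .2, .1]) might return ([.4, 0, .2, 0], [0, .3, 0, .1])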
def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    self.args = args  # save a copy
    horizon = self.cmp.horizon
    terminalReward = self.cmp.terminalReward

    if self.queryType == QueryType.ACTION:
        k = len(args['A'])
    else:
        k = config.NUMBER_OF_RESPONSES

    # now q is a set of policy queries
    bestQ = None
    bestEUS = -numpy.inf

    # keep a copy of currently added policies. may not be used.
    # note that q is passed by reference, so args['q'] tracks additions to it
    # start with the prior optimal policy
    q = [self.getFiniteVIAgent(self.phi, horizon, terminalReward, posterior=True).x]
    args['q'] = q

    objValue = None  # k won't be 1, fine
    # greedily add the remaining policies
    for i in range(1, k):
        if config.VERBOSE:
            print 'iter.', i
        x = self.findNextPolicy(**args)
        q.append(x)

        # query iteration
        # for each x \in q, what is q -> x; \psi? replace x with the optimal posterior policy
        if self.qi:
            q, objValue = self.queryIteration(args, q)
        args['q'] = q

    if self.queryType == QueryType.POLICY:
        # if asking policies directly, then return q
        #return q, objValue # THIS RETURNS EUS, NOT EPU
        return q, objValue
    if self.queryType == QueryType.PARTIAL_POLICY:
        idx = 0
        objValue = self.getQValue(self.cmp.state, None, q)

        qP = copy.copy(q)

        while True:
            # iterate over the policies, removing one state-action pair from each,
            # but make sure the EUS of the new set is unchanged
            x = qP[idx]
            xOld = x.copy()
            success = False
            for key in util.randomly(x.keys()):
                x.pop(key)
                print self.getQValue(self.cmp.state, None, qP), objValue
                if self.getQValue(self.cmp.state, None, qP) == objValue:
                    success = True
                    break
                else:
                    # removing this pair changed the EUS, so put it back
                    x[key] = xOld[key]

            if not success:
                break
            #print idx, len(x)
            idx = (idx + 1) % len(q)

        return qP
    elif self.queryType == QueryType.DEMONSTRATION:
        # we have already built a set of policies, but the query type is demonstration,
        # so we sample trajectories from these policies as a query.
        # note that another way is implemented in MILPDemoAgent, which chooses the next
        # policy based on the demonstrated trajectories.
        qu = [self.sampleTrajectory(x) for x in q]
        return qu
    elif self.queryType in [QueryType.SIMILAR, QueryType.ACTION]:
        # implemented in a subclass, do nothing here
        pass
    else:
        raise Exception('Query type not implemented for MILP.')

    return args, q
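# A minimal, self-contained sketch (not repo code) of the greedy construction
# pattern used above: start from the prior-optimal element and repeatedly add the
# candidate that most increases an EUS-style objective. 'candidates', 'value' and
# 'weights' are hypothetical stand-ins for findNextPolicy / computeV / self.phi.
def greedy_query(candidates, value, weights, k):
    # expected utility when, for each reward candidate r (with prior weight w),
    # the best element of the query under r is followed
    def eus(query):
        return sum(w * max(value(c, r) for c in query)
                   for r, w in enumerate(weights))

    # the prior-optimal candidate, then greedy additions
    q = [max(candidates, key=lambda c: eus([c]))]
    while len(q) < k:
        q.append(max(candidates, key=lambda c: eus(q + [c])))
    return q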
def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    k = config.NUMBER_OF_RESPONSES
    responseTime = self.cmp.getResponseTime()
    horizon = self.cmp.horizon
    terminalReward = self.cmp.terminalReward

    from itertools import combinations

    rewardCandNum = len(self.rewardSet)

    maxObjValue = -numpy.inf
    optQ = None
    optPsis = None

    # the value of the posterior-optimal policy for every subset of reward candidates
    values = util.Counter()
    for i in xrange(1, rewardCandNum + 1):
        for l in combinations(range(rewardCandNum), i):
            psi = [self.phi[j] if j in l else 0 for j in range(rewardCandNum)]
            if config.VERBOSE:
                print psi
            agent = self.getFiniteVIAgent(psi, horizon - responseTime, terminalReward, posterior=True)
            values[tuple(psi)] = agent.getValue(self.cmp.state)

    for subset in combinations(values.items(), k):
        psis = map(lambda _: _[0], subset)
        qs = map(lambda _: _[1], subset)
        # make sure that such a query partitions the reward candidates
        if sum(sum(_ > 0 for _ in psi) for psi in psis) == rewardCandNum and\
           all(sum(psi[i] for psi in psis) > 0 for i in xrange(rewardCandNum)):
            objValue = sum(qs)
            if objValue > maxObjValue:
                maxObjValue = objValue
                optQ = qs
                optPsis = psis

    q = None
    if self.queryType == QueryType.POLICY:
        return q, maxObjValue
    elif self.queryType == QueryType.ACTION:
        hList = []
        # FIXME has a problem here!
        policyBins = self.computeDominatingPis(args, q)
        for s in args['S']:
            hValue = 0
            for a in args['A']:
                resProb = 0
                bins = [0] * len(q)
                for idx in xrange(rewardCandNum):
                    if a in self.viAgentSet[idx].getPolicies(s):
                        # increase the probability of observing this response
                        resProb += self.phi[idx]
                        # put opt policies into bins
                        bins = [sum(_) for _ in zip(bins, policyBins[idx])]
                hValue += resProb * scipy.stats.entropy(bins)
            hList.append((s, hValue))

        # sort them nondecreasingly
        hList = filter(lambda _: not scipy.isnan(_[1]), hList)
        hList = sorted(hList, key=lambda _: _[1])
        hList = hList[:1]
    else:
        raise Exception('Query type not implemented for MILP.')

    qList = []
    for q, h in hList:
        # FIXME ignored transient phase
        qValue = self.getQValue(self.cmp.state, None, q)
        qList.append((q, None, qValue))

    maxQValue = max(map(lambda _: _[2], qList))
    qList = filter(lambda _: _[2] == maxQValue, qList)
    return random.choice(qList)
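# A minimal sketch (illustrative names, not repo code) of the partition check used
# above: a set of k psi vectors is a valid query only if every reward candidate
# receives positive weight in exactly one of them.
def is_partition(psis, rewardCandNum):
    # total count of positive entries equals the number of candidates ...
    coveredOnce = sum(sum(p > 0 for p in psi) for psi in psis) == rewardCandNum
    # ... and every candidate has positive weight in at least one psi
    coveredAll = all(sum(psi[i] for psi in psis) > 0 for i in xrange(rewardCandNum))
    return coveredOnce and coveredAll

# e.g. is_partition([[.4, 0, .2], [0, .3, 0]], 3) -> True
#      is_partition([[.4, 0, 0], [0, .3, 0]], 3) -> False (candidate 2 uncovered)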