import math
import random

import util.classes
import util.functions
import patrol.model


def simulate_givenstart(model, agent, s, t_max):
    '''
    Simulate an MDP from start state s for t_max timesteps or until a
    terminal state is reached.
    Returns a list [(s_0, a_0, r_0), (s_1, a_1, r_1), ...]
    '''
    result = []
    t = 0
    while t < t_max and not model.is_terminal(s):
        a = agent.sample(s)
        s_p = util.functions.sample(model.T(s, a))
        r = model.R(s, a)
        result.append((s, a, r))
        s = s_p
        t += 1
    # Also record the terminal state's action and reward.
    if model.is_terminal(s):
        a = agent.sample(s)
        r = model.R(s, a)
        result.append((s, a, r))
    return result
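
# A minimal sketch of the model/agent interface simulate_givenstart expects,
# using a hypothetical three-state chain. ChainModel and RandomAgent are
# illustrations only; the {state: probability} return type of T is an
# assumption, inferred from how T(s, a).items() is consumed below.
class ChainModel:
    def T(self, s, a):
        # Distribution over next states; only queried at non-terminal
        # states (0 and 1), so the two keys never collide.
        return {min(s + 1, 2): 0.9, s: 0.1}

    def R(self, s, a):
        return 1.0 if s == 2 else -0.1

    def is_terminal(self, s):
        return s == 2


class RandomAgent:
    def sample(self, s):
        return random.choice(['stay', 'advance'])

# Example (relies on util.functions.sample accepting the mapping above):
# traj = simulate_givenstart(ChainModel(), RandomAgent(), 0, t_max=20)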
def iter(cls, model, Q):
    V = util.classes.NumMap()
    # Compute V(s) = max_{a} Q(s,a)
    for s in model.S():
        V_s = util.classes.NumMap()
        for a in model.A(s):
            V_s[a] = Q[(s, a)]
        if len(V_s) > 0:
            V[s] = V_s.max()
        else:
            V[s] = 0.0
    # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s')
    QQ = util.classes.NumMap()
    for s in model.S():
        for a in model.A(s):
            value = model.R(s, a)
            T = model.T(s, a)
            value += sum(model.gamma * t * V[s_prime] for (s_prime, t) in T.items())
            QQ[(s, a)] = value
    # To find the log policy, find the argmax at each state and then create
    # a new Q with each (s,a) = oldQ - (max for that state).
    return QQ
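
# The comment above describes shifting each state's Q-values by their max.
# A minimal sketch of that step (log_policy_from_Q is a hypothetical helper;
# the NumMap usage mirrors the code above). Note this yields unnormalized
# log preferences with the best action at 0; QValueSoftMaxSolve below
# computes the properly normalized log policy.
def log_policy_from_Q(model, Q):
    logp = util.classes.NumMap()
    for s in model.S():
        actions = list(model.A(s))
        if not actions:
            continue
        best = max(Q[(s, a)] for a in actions)
        for a in actions:
            # logp(s,a) = Q(s,a) - max_{a'} Q(s,a')
            logp[(s, a)] = Q[(s, a)] - best
    return logp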
def sample_model(model, n_samples, distr, agent):
    '''
    Sample transitions (s, a, r, s') where s is drawn from distr and a from
    the agent's policy.
    Returns [(s_0, a_0, r_0, s_p_0), (s_1, a_1, r_1, s_p_1), ...]
    '''
    result = []
    for _ in range(n_samples):
        s = util.functions.sample(distr)
        a = agent.sample(s)
        r = model.R(s, a)
        s_p = util.functions.sample(model.T(s, a))
        result.append((s, a, r, s_p))
    return result
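
# sample_model assumes util.functions.sample draws a key from an
# {item: probability} mapping. A sketch of that presumed behavior, shown
# only to document the convention (sample_from_map is hypothetical; the
# project's own helper may differ):
def sample_from_map(distr):
    # Inverse-CDF sampling over a nonempty {item: probability} dict.
    r = random.random()
    acc = 0.0
    for item, p in distr.items():
        acc += p
        if r <= acc:
            return item
    return item  # guard against floating-point shortfall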
def QValueSoftMaxSolve(model, thresh=1):
    v = util.classes.NumMap()
    for s in model.S():
        v[s] = 0.0
    diff = 100.0
    while diff >= thresh:
        vp = v
        # Q(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*v(s')
        Q = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s, a)
                T = model.T(s, a)
                value += sum(model.gamma * t * v[s_prime] for (s_prime, t) in T.items())
                Q[(s, a)] = value
        # Soft Bellman backup: v(s) = log sum_a exp(Q(s,a)), computed with
        # the per-state max subtracted out for numerical stability.
        v = util.classes.NumMap()
        for s in model.S():
            maxx = None
            for a in model.A(s):
                if maxx is None or Q[(s, a)] > maxx:
                    maxx = Q[(s, a)]
            e_sum = 0.0
            for a in model.A(s):
                e_sum += math.exp(Q[(s, a)] - maxx)
            v[s] = maxx + math.log(e_sum)
        diff = max(abs(value - vp[s]) for (s, value) in v.items())
    # Log policy: log pi(a|s) = Q(s,a) - v(s).
    logp = util.classes.NumMap()
    for (sa, value) in Q.items():
        logp[sa] = value - v[sa[0]]
    return logp
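
# The inner loop above is the standard numerically stable log-sum-exp:
# log(sum_a exp(Q(s,a))) = m + log(sum_a exp(Q(s,a) - m)) with
# m = max_a Q(s,a). A standalone version for reference:
def logsumexp(xs):
    m = max(xs)
    return m + math.log(sum(math.exp(x - m) for x in xs))

# e.g. logsumexp([1000.0, 1000.0]) == 1000.0 + math.log(2.0), whereas the
# naive math.log(sum(math.exp(x) for x in xs)) would overflow.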
def multi_simulate(model, policies, initials, t_max, interactionlength):
    # policies = [policy1, policy2, equilibrium1, equilibrium2]
    result = []
    Ss = []
    for initial in initials:
        Ss.append(util.functions.sample(initial))
        result.append([])
    t = 0
    atTerminal = False
    interactionCooldown = [-1 for _ in range(len(policies))]
    while t < t_max and not atTerminal:
        actions = [None for _ in initials]
        # When both equilibrium policies are supplied and neither agent is
        # already interacting, start an interaction if the agents' states
        # conflict (or they swapped through each other since the last step).
        if (policies[2] is not None and policies[3] is not None
                and interactionCooldown[0] < 0 and interactionCooldown[1] < 0):
            for (i, s) in enumerate(Ss):
                for (j, s2) in enumerate(Ss):
                    if i != j:
                        if s2.conflicts(s) or (t > 0 and
                                (result[i][t - 1][0].conflicts(s2) or
                                 s.conflicts(result[j][t - 1][0]))):
                            interactionCooldown[0] = interactionlength
                            interactionCooldown[1] = interactionlength
        for (i, a) in enumerate(actions):
            if interactionCooldown[i] <= 0:
                # Not interacting: follow the agent's own policy.
                actions[i] = policies[i].sample(Ss[i])
            elif interactionCooldown[i] > 1:
                # Mid-interaction: stand still.
                actions[i] = patrol.model.PatrolActionStop()
            else:
                # Final interaction step: sample the equilibrium policy, but
                # defer to the agent's own policy if it says to move forward.
                actions[i] = util.functions.sample(policies[2 + i])
                if actions[i].__class__.__name__ == "PatrolActionMoveForward":
                    actions[i] = policies[i].sample(Ss[i])
            interactionCooldown[i] -= 1
        for (i, a) in enumerate(actions):
            result[i].append((Ss[i], actions[i]))
            Ss[i] = util.functions.sample(model.T(Ss[i], actions[i]))
            if model.is_terminal(Ss[i]):
                atTerminal = True
        t += 1
    # Record each agent's terminal state, matching the (state, action)
    # records appended above.
    if atTerminal:
        for (i, s) in enumerate(Ss):
            a = policies[i].sample(s)
            result[i].append((s, a))
    return result
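
# A hypothetical call, for orientation; every name below is a placeholder
# for objects the surrounding patrol code would construct:
# trajectories = multi_simulate(
#     patrol_model,
#     [policy_a, policy_b, equilibrium_a, equilibrium_b],
#     [initial_distr_a, initial_distr_b],
#     t_max=500,
#     interactionlength=3)
# trajectories[i] is agent i's list of (state, action) pairs.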