class TestTree(unittest.TestCase):

    def setUp(self):
        self.pomdp = Tiger()
        self.start = State(LEFT)  # tiger behind left door
        self.o = Observation()
        self.h = History()
        self.a = POMDPAction()

    def test_create_node(self):
        node = create_node(self.h, self.a, self.o)
        self.assertTrue(isinstance(node, Node))
        self.assertEqual(self.a, node.a)
        self.assertEqual(len(self.h), 1)
        self.assertFalse(node.inTree)

    def test_create_children(self):
        node = create_node(self.h, self.a, self.o)
        node.create_children()
        self.assertEqual(3, len(node.children))
        for act, child in node.children.items():
            self.assertFalse(child.inTree)

    def test_is_in_tree(self):
        # set up a tree with the root and its depth-1 children
        root = create_node(self.h, self.a, self.o)
        root.inTree = True
        root.create_children()
        for act, child in root.children.items():
            obs, r = child.a.do_on(self.start.clone())
            child.h.add(child.a, obs)
            child.inTree = True
        h = root.h.clone()
        a = Action(listen=True)
        s = self.start.clone()
        o, r = a.do_on(s)
        h.add(a, o)
        self.assertTrue(root.is_intree(h))
        h2 = h.clone()
        h2.add(a, o)
        self.assertFalse(root.is_intree(h2))

    def test_pref_actions(self):
        self.h.add(self.a, self.o)
        a = Action(listen=True)
        s = self.start.clone()
        o, r = a.do_on(s)
        self.h.add(a, o)
        a2 = Action(direction=LEFT)
        o2, r = a2.do_on(s)
        node = create_node(self.h, a2, o2)
        self.assertEqual(12, node.V)
        self.assertEqual(5, node.N)
def setUp(self):
    self.pomdp = Tiger()
    self.start = State(LEFT)  # tiger behind left door
    o = Observation()
    h = History()
    a = POMDPAction()
    self.root = create_node(h, a, o)
    self.root.inTree = True
    params['c'] = 2
    # set up a tree with the root and its depth-1 children
    self.root.create_children()
    # expand the listen node
    a = Action(listen=True)
    s = self.start.clone()
    o, r = a.do_on(s)
    self.listen_child = self.root.children[a]
    self.listen_child.h.add(a, o)
    self.listen_child.inTree = True
    self.listen_child.create_children()
    # There are 3 depth-2 nodes; their histories are:
    # empty-listen-listen (ell), empty-listen-left (elf), empty-listen-right (elr).
    # elr should have the highest V (preferred action).
    for act, child in self.listen_child.children.items():
        obs, r = child.a.do_on(s.clone())
        self.listen_child.children[act] = create_node(child.h, act, obs)
        self.listen_child.children[act].inTree = True
def test_simulate_expansion(self):
    root = create_node(History(), POMDPAction(), Observation())
    params.update({
        'start_time': time.time(),
        'gamma': 0.5,
        'epsilon': 0.2,
        'max_depth': 100,
        'timeout': 3
    })
    simulate(self.start, root)
    self.assertEqual(len(root.children), 3)
    self.assertEqual(root.N, 1)
class MCPlayer(AbstractPlayer):

    def __init__(self, max_iter, timeout, log=0, pref=True):
        self.max_iter = max_iter
        params['timeout'] = timeout
        params['log'] = log
        if not pref:
            params['prefs'] = False
        self.h = History()
        self.last_action = POMDPAction()
        self.first = True

    def next_action(self, state):
        # initialise the domain knowledge on the first call
        if self.first:
            self.dom_kno = Minesweeper(state.board.h, state.board.w, state.board.m)
        # update the history with the last action / observation pair
        o = Observation(state.board.clone().knowledge, state.board.m)
        self.h.add(self.last_action, o)
        # launch UCT to select the next best action given the current history
        a = search(self.h.clone(), self.dom_kno, self.max_iter, clean=self.first)
        if self.first:
            self.first = False
        self.last_action = a
        assert isinstance(a, Action)
        return a.cell

    def reset(self):
        self.h = History()
        self.last_action = POMDPAction()
        self.first = True
def run_policy(self, policy_param, num_episodes):
    world = self.Env()
    policy = self.Pol
    policy.parameters = policy_param
    J_hats = []
    steps = []
    states = []
    actions = []
    rewards = []
    for n in range(num_episodes):
        ret = 0
        step = 0
        states.append([])
        actions.append([])
        rewards.append([])
        while not world.isEnd:
            states[n].append(world.state)
            step += 1
            action = policy.sampleAction(world.state)
            world.step(action)
            # discounting is currently disabled; re-enable with `* world.gamma**step`
            discounted_reward = world.reward
            ret += discounted_reward
            actions[n].append(action)
            rewards[n].append(discounted_reward)
        J_hats.append(ret)
        self.results.append(ret)
        steps.append(step)
        world.reset()
    # wrap each episode into a History to build the data set D
    D = []
    for i in range(num_episodes):
        history = History(states[i], actions[i], rewards[i])
        D.append(history)
    return np.mean(J_hats), D, np.max(J_hats)
class TestHistory(unittest.TestCase):

    def setUp(self):
        self.b = Board(4, 5, 3)
        self.s = State(self.b)
        self.h = History()

    def test_add(self):
        a = Action(0, 0)
        o, r = a.do_on(self.s)
        self.h.add(a, o)
        a2 = Action(2, 1)
        o2, r2 = a2.do_on(self.s)
        self.h.add(a2, o2)
        self.assertEqual(self.h.last_action(), a2)
        self.assertEqual(self.h.last_obs(), o2)

    def test_clone(self):
        a = Action(1, 0)
        o, r = a.do_on(self.s)
        self.h.add(a, o)
        h = self.h.clone()
        self.assertEqual(h, self.h)
np.random.seed(int(time.time()))

# Load data from CSV
state_dimension, num_actions, iOrder, theta_b, data_size, states, actions, rewards, pi_b = import_data(
    "../data/test_data/data.csv")
print("Loaded data.")
print(state_dimension, num_actions, iOrder, theta_b, data_size)

D = []

# Parameters to select
c = 2
dOrder = 0
iteration_to_run = 1000

# Create the data set of histories
for i in range(data_size):
    history = History(states[i], actions[i], rewards[i])
    D.append(history)

# Shuffle the data and keep the first actual_data_size histories
np.random.shuffle(D)
D = D[:actual_data_size]
slog(len(D))

# Estimate the behaviour policy's expected return J(pi_b) from the data
J_pi_b_hat = 0.0
for H in D:
    J_pi_b_hat += np.sum(H.rewards)
J_pi_b_hat /= len(D)
slog(J_pi_b_hat)

policy = FBSoftmax(state_dimension, num_actions, iOrder, dOrder)
import os

params = {
    'K': 50,           # number of particles (size of the belief state space)
    'c': 0,            # exploration / exploitation trade-off constant (domain specific)
    'epsilon': 0.0,    # history discount factor
    'gamma': 1,        # reward discount factor
    'R_lo': 0,         # lowest value V(h) reached
    'R_hi': 1,         # highest value V(h) reached
    'timeout': 120,    # timeout for each iteration, in seconds
    'start_time': 0,   # start time, in seconds
    'max_depth': 20,   # maximum search depth
    'log': 1,          # level of logs printed on the console, in [0, 2]
    'prefs': True,     # enable/disable preferred actions
    'root': Node(POMDPAction(), History(), 0, 0, list())
}

# start time
start_time = 0


def UCB1_action_selection(node, greedy=False):
    """
    Implementation of the UCB1 algorithm for solving a multi-armed bandit problem.
    https://homes.di.unimi.it/~cesabian/Pubblicazioni/ml-02.pdf

    Each action a available from the history h is assigned a value V(ha),
    computed from simulations of the POMDP starting from h. In non-greedy mode,
    this value is augmented by an exploration bonus for rarely tried actions.
    """
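# The body of UCB1_action_selection is not shown in this excerpt. The sketch
# below is an illustrative, minimal version of the UCB1 rule described in the
# docstring above, not the project's actual implementation. It only assumes
# what the tests already exercise: each node stores a visit count N, a value
# estimate V, and a dict of children keyed by action, and params['c'] holds
# the exploration constant. The helper name _ucb1_sketch is hypothetical.
import math


def _ucb1_sketch(node, greedy=False):
    best_action, best_value = None, -float('inf')
    for action, child in node.children.items():
        value = child.V
        if not greedy:
            # exploration bonus for rarely tried actions (the +1 terms only
            # guard this sketch against unvisited nodes)
            value += params['c'] * math.sqrt(math.log(node.N + 1) / (child.N + 1))
        if value > best_value:
            best_action, best_value = action, value
    return best_action, best_value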