Example #1
    def __init__(self, max_iter, timeout, log=0, pref=True):
        self.max_iter = max_iter
        params['timeout'] = timeout
        params['log'] = log
        if not pref:
            params["prefs"] = False
        self.h = History()
        self.last_action = POMDPAction()
        self.first = True
Example #2
class TestTree(unittest.TestCase):
    def setUp(self):
        self.pomdp = Tiger()
        self.start = State(LEFT)  # tiger behind left door
        self.o = Observation()
        self.h = History()
        self.a = POMDPAction()

    def test_create_node(self):
        node = create_node(self.h, self.a, self.o)
        self.assertTrue(isinstance(node, Node))
        self.assertEqual(self.a, node.a)
        self.assertEqual(len(self.h), 1)
        self.assertFalse(node.inTree)

    def test_create_children(self):
        node = create_node(self.h, self.a, self.o)
        node.create_children()
        self.assertEqual(3, len(node.children))
        for act, child in node.children.items():
            self.assertFalse(child.inTree)

    def test_is_in_tree(self):
        # set up a tree with the root and its depth-1 children
        root = create_node(self.h, self.a, self.o)
        root.inTree = True
        root.create_children()
        for act, child in root.children.items():
            obs, r = child.a.do_on(self.start.clone())
            child.h.add(child.a, obs)
            child.inTree = True

        h = root.h.clone()
        a = Action(listen=True)
        s = self.start.clone()
        o, r = a.do_on(s)
        h.add(a, o)
        self.assertTrue(root.is_intree(h))
        h2 = h.clone()
        h2.add(a, o)
        self.assertFalse(root.is_intree(h2))

    def test_pref_actions(self):
        self.h.add(self.a, self.o)
        a = Action(listen=True)
        s = self.start.clone()
        o, r = a.do_on(s)
        self.h.add(a, o)
        a2 = Action(direction=LEFT)
        o2, r = a2.do_on(s)
        node = create_node(self.h, a2, o2)
        self.assertEqual(12, node.V)
        self.assertEqual(5, node.N)
Example #3
    def setUp(self):
        self.pomdp = Tiger()
        self.start = State(LEFT)  # tiger behind left door
        o = Observation()
        h = History()
        a = POMDPAction()
        self.root = create_node(h, a, o)
        self.root.inTree = True

        params['c'] = 2
        # set up a tree with the root and its depth-1 children
        self.root.create_children()
        # expand listen-node
        a = Action(listen=True)
        s = self.start.clone()
        o, r = a.do_on(s)
        self.listen_child = self.root.children[a]
        self.listen_child.h.add(a, o)
        self.listen_child.inTree = True
        self.listen_child.create_children()
        # There are 3 depth-2 nodes; their histories go like:
        # empty-listen-listen (ell), empty-listen-left (elf), empty-listen-right (elr)
        # elr should have the highest V (preferred action)
        for act, child in self.listen_child.children.items():
            obs, r = child.a.do_on(s.clone())
            self.listen_child.children[act] = create_node(child.h, act, obs)
            self.listen_child.children[act].inTree = True
Example #4
    def test_simulate_expansion(self):
        root = create_node(History(), POMDPAction(), Observation())
        #self.root1 = root
        params.update({
            'start_time': time.time(),
            'gamma': 0.5,
            'epsilon': 0.2,
            'max_depth': 100,
            'timeout': 3
        })
        simulate(self.start, root)
        self.assertEqual(len(root.children), 3)
        self.assertEqual(root.N, 1)
Example #5
class MCPlayer(AbstractPlayer):
    def __init__(self, max_iter, timeout, log=0, pref=True):
        self.max_iter = max_iter
        params['timeout'] = timeout
        params['log'] = log
        if not pref:
            params["prefs"] = False
        self.h = History()
        self.last_action = POMDPAction()
        self.first = True

    def next_action(self, state):
        # init domain knowledge
        if self.first:
            self.dom_kno = Minesweeper(state.board.h, state.board.w,
                                       state.board.m)
            #self.first = False
        # update history with the last (action, observation) pair
        o = Observation(state.board.clone().knowledge, state.board.m)
        self.h.add(self.last_action, o)
        #print(self.h)
        # launch UCT to select next best action based on current history
        a = search(self.h.clone(),
                   self.dom_kno,
                   self.max_iter,
                   clean=self.first)
        if self.first:
            self.first = False
        self.last_action = a
        assert isinstance(a, Action)
        return a.cell

    def reset(self):
        self.h = History()
        self.last_action = POMDPAction()
        self.first = True
Example #6
    def run_policy(self, policy_param, num_episodes):

        world = self.Env()
        policy = self.Pol
        policy.parameters = policy_param

        J_hats = []
        steps = []
        states = []
        actions = []
        rewards = []

        for n in range(num_episodes):
            ret = 0
            step = 0

            states.append([])
            actions.append([])
            rewards.append([])

            while not world.isEnd:
                states[n].append(world.state)

                step += 1
                action = policy.sampleAction(world.state)
                world.step(action)
                discounted_reward = world.reward  #* world.gamma**step
                ret += discounted_reward

                actions[n].append(action)
                rewards[n].append(discounted_reward)

            J_hats.append(ret)
            self.results.append(ret)
            steps.append(step)
            world.reset()

        D = []

        for i in range(num_episodes):
            history = History(states[i], actions[i], rewards[i])
            D.append(history)

        return np.mean(J_hats), D, np.max(J_hats)
Example #7
class TestHistory(unittest.TestCase):
    def setUp(self):
        self.b = Board(4, 5, 3)
        self.s = State(self.b)
        self.h = History()

    def test_add(self):
        a = Action(0, 0)
        o, r = a.do_on(self.s)
        self.h.add(a, o)
        a2 = Action(2, 1)
        o2, r2 = a2.do_on(self.s)
        self.h.add(a2, o2)
        self.assertEqual(self.h.last_action(), a2)
        #print(o2)
        #print(self.h.last_obs())
        self.assertEqual(self.h.last_obs(), o2)

    def test_clone(self):
        a = Action(1, 0)
        o, r = a.do_on(self.s)
        self.h.add(a, o)
        h = self.h.clone()
        self.assertEqual(h, self.h)
Example #8
    def reset(self):
        self.h = History()
        self.last_action = POMDPAction()
        self.first = True
Example #9
    np.random.seed(int(time.time()))
    # Load data from CSV
    state_dimension, num_actions, iOrder, theta_b, data_size, states, actions, rewards, pi_b = import_data(
        "../data/test_data/data.csv")
    print("Loaded data.")
    print(state_dimension, num_actions, iOrder, theta_b, data_size)
    D = []

    # Parameter to select
    c = 2
    dOrder = 0
    iteration_to_run = 1000

    # Create Data set
    for i in range(data_size):
        history = History(states[i], actions[i], rewards[i])
        D.append(history)

    # Shuffle the data
    np.random.shuffle(D)

    D = D[:actual_data_size]
    slog(len(D))

    J_pi_b_hat = 0.0
    for H in D:
        J_pi_b_hat += np.sum(H.rewards)
    J_pi_b_hat /= len(D)
    slog(J_pi_b_hat)

    policy = FBSoftmax(state_dimension, num_actions, iOrder, dOrder)
Example #10
import os

params = {
    'K': 50,  # number of particles (size of the belief state space)
    'c': 0,  # exploration / exploitation ratio scalar constant (domain specific)
    'epsilon': 0.0,  # history discount factor
    'gamma': 1,  # reward discount factor
    'R_lo': 0,  # lowest value V(h) reached 
    'R_hi': 1,  # highest value V(h) reached
    'timeout': 120,  # timeout for each iteration in seconds
    'start_time': 0,  # start time in seconds
    'max_depth': 20,  # max depth
    'log': 1,  # verbosity of logs printed to console, in [0, 2]
    'prefs': True,  # enable/disable preferred actions
    'root': Node(POMDPAction(), History(), 0, 0, list())
}

# start time
start_time = 0


def UCB1_action_selection(node, greedy=False):
    """
    Implementation of the UCB1 algorithm for solving a multi-armed bandit problem.

    https://homes.di.unimi.it/~cesabian/Pubblicazioni/ml-02.pdf

    Each action a available from history h is assigned a value V(ha),
    computed from simulations of the POMDP starting from history h.
    In non-greedy mode, this value is augmented by an exploration bonus for rarely-tried actions.
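
The body of UCB1_action_selection is cut off above. As a rough sketch only, the selection rule the docstring describes could look like the hypothetical helper below, reusing the node attributes (V, N, children) and the params['c'] constant that appear in the surrounding examples; the actual implementation in the source module may differ.

import math

def ucb1_sketch(node, greedy=False):
    # Hypothetical helper (not from the original module): pick the child action
    # maximizing V(ha) plus, in non-greedy mode, the UCB1 exploration bonus
    # c * sqrt(log N(h) / N(ha)).
    best_action, best_value = None, float('-inf')
    for action, child in node.children.items():
        value = child.V
        if not greedy:
            if child.N == 0:
                value = float('inf')  # always try untested actions first
            else:
                value += params['c'] * math.sqrt(math.log(max(node.N, 1)) / child.N)
        if value > best_value:
            best_action, best_value = action, value
    return best_action, best_value

Presumably, in greedy mode (e.g. when extracting the final action after the search budget is spent) only the plain V(ha) values are compared, which is what the sketch does when greedy=True.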
Example #11
    def setUp(self):
        self.pomdp = Tiger()
        self.start = State(LEFT)  # tiger behind left door
        self.o = Observation()
        self.h = History()
        self.a = POMDPAction()
Example #12
    def setUp(self):
        self.b = Board(4, 5, 3)
        self.s = State(self.b)
        self.h = History()