# Assumed imports for these snippets; the exact module paths may differ in your checkout.
import pytest
import numpy as np
from reinforcepy.handlers import ActionHandler, ActionPolicy


def test_set_legal_actions(action_handler: ActionHandler):
    # setting legal actions should raise an error on matrix (2D) input
    with pytest.raises(AssertionError):
        action_handler.set_legal_actions([[0, 2, 4, 6]])

    action_handler.set_legal_actions([0, 2, 4, 6])
    assert action_handler.numActions == 4
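# Note on the legal-action mapping these tests rely on: after
# set_legal_actions([0, 2, 4, 6]), action index i corresponds to game action
# legal_actions[i]. For example, index 1 is game action 2, and the one-hot
# vector [0, 0, 1, 0] (index 2) is game action 4.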
def test_get_action(action_handler: ActionHandler):
    action_ind = action_handler.get_action([1, 0, 0, 0], random=False)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(type(action_ind))
    assert action_ind == 0

    action_handler.get_action([1, 0, 0, 0])  # just make sure random doesn't fail
def test_get_random(action_handler: ActionHandler):
    # reset curr rand val
    action_handler.curr_rand_val = 1

    # should be a random action
    random, action = action_handler.get_random()
    assert random is True
    assert action in [0, 2, 4, 6]

    # shouldn't be a random action
    action_handler.curr_rand_val = 0
    random, action = action_handler.get_random()
    assert random is False
    assert action is None
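# A minimal sketch of the epsilon-greedy coin flip the test above exercises.
# Illustration only, not the library's implementation: it assumes the handler
# exposes `curr_rand_val` and a `legal_actions` sequence (that attribute name
# is an assumption). With probability curr_rand_val it returns
# (True, random_legal_action); otherwise (False, None).
def _get_random_sketch(handler):
    if np.random.uniform() < handler.curr_rand_val:
        return True, np.random.choice(handler.legal_actions)
    return False, None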
class AsyncProcessA3CLearner(AsyncProcessClient):
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe,
                 skip_frame=4, phi_length=4, async_update_step=5):
        super().__init__(pipe)

        # A3C doesn't have an E-greedy exploration policy so we set the random values to 0
        self.action_handler = ActionHandler((0, 0, 2))

        # initialize network
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)

        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()
        self.async_update_step = async_update_step

    def add_state_to_buffer(self, state):
        # shift the buffer left one frame and append the new state at the end
        self.frame_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        self.frame_buffer[0, self.phi_length - 1] = state

    def frame_buffer_with(self, state):
        # return a copy of the buffer as if `state` had been added, without mutating it
        empty_buffer = np.zeros((1, self.phi_length, 84, 84), dtype=np.float32)
        empty_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        empty_buffer[0, self.phi_length - 1] = state
        return empty_buffer

    def get_action(self, frame_buffer):
        return self.cnn.get_policy_output(frame_buffer)[0]

    def get_game_action(self, frame_buffer):
        action = self.get_action(frame_buffer)
        return self.action_handler.action_vect_to_game_action(action, random=False)

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
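# Quick illustration of the rolling frame buffer logic above, using tiny 2x2
# "frames" instead of 84x84 screens (illustrative only): each add drops the
# oldest of the phi_length frames and appends the newest, so the network always
# sees the last phi_length observations.
buf = np.zeros((1, 4, 2, 2), dtype=np.float32)
for i in range(5):
    frame = np.full((2, 2), i, dtype=np.float32)
    buf[0, 0:3] = buf[0, 1:4]  # shift left: drop the oldest frame
    buf[0, 3] = frame          # append the newest frame
# buf now holds frames 1, 2, 3, 4 -- the last four observations
assert (buf[0, :, 0, 0] == [1, 2, 3, 4]).all()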
def test_anneal_to(action_handler: ActionHandler):
    # zero should be highest rand val
    action_handler.anneal_to(0)
    assert action_handler.curr_rand_val == 1

    # one should be in the middle
    action_handler.anneal_to(1)
    assert action_handler.curr_rand_val == 0.5

    # 2 and greater should be lowest
    action_handler.anneal_to(2)
    assert action_handler.curr_rand_val == 0
    assert action_handler.curr_rand_val == action_handler.lowest_rand_val

    action_handler.anneal_to(999)
    assert action_handler.curr_rand_val == 0
    assert action_handler.curr_rand_val == action_handler.lowest_rand_val
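# A linear annealing schedule consistent with the assertions above for the
# fixture's rand_vals = (1, 0, 3): the per-step rate is (start - end) / (steps - 1)
# and the value is clipped at the lowest rand val. This is a sketch of the
# expected behaviour, not necessarily how ActionHandler implements it.
def _anneal_to_sketch(step, start=1.0, end=0.0, steps=3):
    rate = (start - end) / (steps - 1)
    return max(end, start - rate * step)

assert _anneal_to_sketch(0) == 1.0
assert _anneal_to_sketch(1) == 0.5
assert _anneal_to_sketch(2) == 0.0
assert _anneal_to_sketch(999) == 0.0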
class BaseThreadLearner(threading.Thread):
    def __init__(self, environment, network, global_dict, phi_length=4, async_update_step=5,
                 reward_clip_vals=[-1, 1], random_policy=True, epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=1000000, global_epsilon_annealing=True, testing=False):
        super().__init__()

        # If doing a random (E-greedy) policy
        self.random_policy = random_policy
        if random_policy:
            # initialize action handler; the ending E-greedy value is 0.1, 0.01 or 0.5
            # with probability 0.4, 0.3, 0.3 respectively
            end_rand = np.random.choice(epsilon_annealing_choices, p=epsilon_annealing_probabilities)
            rand_vals = (epsilon_annealing_start, end_rand, epsilon_annealing_steps)
            self.action_handler = ActionHandler(environment.get_num_actions(), rand_vals)

        self.step_count = 0
        self.environment = environment
        self.reward_clip_vals = reward_clip_vals

        # network stuff
        self.network = network
        self.phi_length = phi_length
        self.frame_buffer = FrameBuffer([1, phi_length] + environment.get_state_shape())
        self.async_update_step = async_update_step

        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing

        self.minibatch_vars = {}
        self.reset_minibatch()

        self.testing = testing

    def reset(self):
        self.reset_minibatch()
        self.frame_buffer.reset()

        # initialize the buffer with states
        # TODO: add random starts here
        state = self.environment.get_state()
        for _ in range(self.phi_length):
            self.frame_buffer.add_state_to_buffer(state)

    def run(self):
        while not self.global_dict['done']:
            reward = self.run_episode(self.environment)
            self.global_dict['add_reward'](reward)

            curr_rand_val = ''
            if self.random_policy:
                curr_rand_val = 'Curr Rand Val: {0}'.format(self.action_handler.curr_rand_val)
            print(self, 'Episode reward:', reward, 'Steps:', self.environment.curr_step_count,
                  'Step count:', self.step_count, curr_rand_val)

    def update(self, *args, **kwargs):
        raise NotImplementedError('Base onestep learner does not implement update.')

    def anneal_random_policy(self):
        if self.random_policy:
            # anneal action handler, using the global counter if annealing globally
            anneal_step = self.global_dict['counter'] if self.global_epsilon_annealing else self.step_count
            self.action_handler.anneal_to(anneal_step)

    def get_action(self, state):
        """
        Gets an action for the current state. First queries action_handler to see
        if we should execute a random action. If the action is random, don't send
        the state to the GPU.
        """
        if self.random_policy:
            # check if doing a random action
            random, action = self.action_handler.get_random()
            if not random:
                return self.network.get_output(self.frame_buffer.get_buffer_with(state))
            return action
        else:
            return self.network.get_output(self.frame_buffer.get_buffer_with(state))

    def reset_minibatch(self):
        pass
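# A minimal sketch of the shared `global_dict` that BaseThreadLearner expects.
# The keys ('done', 'counter', 'add_reward') are the ones the class actually
# reads above; the reward bookkeeping shown here is a hypothetical stand-in.
rewards = []
global_dict = {
    'done': False,                 # flip to True to stop all learner threads
    'counter': 0,                  # global step counter shared across threads
    'add_reward': rewards.append,  # called with each finished episode's reward
}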
def test_game_action_to_action_ind(action_handler: ActionHandler):
    action_ind = action_handler.game_action_to_action_ind(2)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(type(action_ind))
    assert action_ind == 1
class DQNLearner(BaseQLearner):
    def __init__(self, learner_parms, network, batch_size=32, testing=False):
        # set required parameters
        learner_parms.required(['skip_frame', 'egreedy_policy', 'dataset_shape', 'max_dataset_size',
                                'phi_length', 'minimum_replay_size', 'minibatch_size'])

        # initialize action handler; starting at 1, anneal the E-greedy policy to 0.1.
        # num actions starts at 0 and is set later via set_action_num
        rand_vals = learner_parms.get('egreedy_policy')
        self.action_handler = ActionHandler(0, rand_vals)

        # set cnn to the passed-in network
        self.cnn = network

        # initialize experience replay
        dataset_shape = learner_parms.get('dataset_shape')
        self.exp_replay = DataSet(dataset_shape['width'], dataset_shape['height'],
                                  max_steps=learner_parms.get('max_dataset_size'),
                                  phi_length=learner_parms.get('phi_length'))
        self.minimum_replay_size = learner_parms.get('minimum_replay_size')

        # initialize other vars
        self.batch_size = batch_size
        self.skip_frame = learner_parms.get('skip_frame')
        self.step_count = 0
        self.testing = testing

    def run_epoch(self, environment, epoch_step_count=50000):
        episode_rewards = list()
        start_step_count = self.step_count
        while (self.step_count - start_step_count) < epoch_step_count:
            episode_rewards.append(self.run_episode(environment))
        return episode_rewards

    def get_action(self, state):
        """
        Gets an action for the current state. First queries action_handler to see
        if we should execute a random action. If the action is random, don't send
        the state to the GPU.
        """
        # check if doing a random action
        random, action = self.action_handler.get_random()
        if not random:
            # NOTE: the reshape assumes skip_frame == phi_length (both default to 4)
            cnn_action_values = self.get_action_values(
                self.exp_replay.phi(state).reshape((1, self.skip_frame, 84, 84)))
            # we already checked for a random action above
            return self.action_handler.get_action(cnn_action_values, random=False)
        return action

    def update(self, state, action, reward, state_tp1, terminal):
        """
        Adds the experience to replay memory, runs a minibatch and anneals the random policy
        """
        reward = np.clip(reward, -1, 1)
        self.exp_replay.add_sample(state, action, reward, terminal)

        if not self.testing:
            self.run_minibatch()

        # anneal action handler
        self.step_count += 1
        self.action_handler.anneal_to(self.step_count)

    def run_minibatch(self):
        # generate minibatch data once the replay memory is big enough
        if self.exp_replay.size > self.minimum_replay_size:
            states, actions, rewards, state_tp1s, terminal = self.exp_replay.random_batch(self.batch_size)
            self.cnn.train(states, actions, rewards, state_tp1s, terminal)

    def get_action_values(self, processed_screens):
        return self.cnn.get_output(processed_screens)

    def get_status(self):
        return 'Step Count: {0}, Current Rand Val: {1}'.format(self.step_count,
                                                               self.action_handler.curr_rand_val)

    def set_action_num(self, num_actions):
        self.action_handler.num_actions = num_actions

    def save(self, filename):
        self.cnn.save(filename)
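# A hypothetical usage sketch for DQNLearner. ParamsStub is a stand-in for the
# real parameter container (only its required()/get() interface is inferred
# from the class above); the parameter values mirror the defaults used in these
# snippets (84x84 screens, phi_length 4, E-greedy annealed from 1 to 0.1).
class ParamsStub:
    def __init__(self, **kwargs):
        self._p = kwargs

    def required(self, keys):
        missing = [k for k in keys if k not in self._p]
        assert not missing, 'missing required parameters: {}'.format(missing)

    def get(self, key):
        return self._p[key]


parms = ParamsStub(skip_frame=4, egreedy_policy=(1, 0.1, 1000000),
                   dataset_shape={'width': 84, 'height': 84},
                   max_dataset_size=1000000, phi_length=4,
                   minimum_replay_size=100, minibatch_size=32)
# learner = DQNLearner(parms, network)  # network: any object with train/get_output/save
# learner.set_action_num(4)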
def test_action_vect_to_game_action(action_handler: ActionHandler):
    game_action = action_handler.action_vect_to_game_action([0, 0, 1, 0], random=False)
    assert isinstance(game_action, np.integer), "expected int got {}".format(type(game_action))
    assert game_action == 4
def test_rand_vals():
    # just test to make sure rand vals doesn't fail
    action_handler = ActionHandler((1, 0.1, 2), ActionPolicy.randVals, [0, 2, 4, 6])
    action_handler.get_action([0, 0, 0, 0])
def test_anneal(action_handler: ActionHandler):
    action_handler.anneal()
    action_handler.anneal()
    action_handler.anneal()
    assert action_handler.curr_rand_val == 0
    assert action_handler.curr_rand_val == action_handler.lowest_rand_val
@pytest.fixture
def action_handler():
    # rand vals: start at 1, anneal to 0 over 3 steps
    act = ActionHandler((1, 0, 3))
    return act