# Assumed imports for these snippets; the exact module paths may differ in your checkout.
import pytest
import numpy as np
from reinforcepy.handlers import ActionHandler, ActionPolicy


def test_set_legal_actions(action_handler: ActionHandler):
    # setting legal actions should raise an error on matrix (2D) input
    with pytest.raises(AssertionError):
        action_handler.set_legal_actions([[0, 2, 4, 6]])

    action_handler.set_legal_actions([0, 2, 4, 6])
    assert action_handler.numActions == 4
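# Note on the legal-action mapping these tests rely on: after
# set_legal_actions([0, 2, 4, 6]), action index i corresponds to game action
# legal_actions[i]. For example, index 1 is game action 2, and the one-hot
# vector [0, 0, 1, 0] (index 2) is game action 4.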
def test_get_action(action_handler: ActionHandler):
    action_ind = action_handler.get_action([1, 0, 0, 0], random=False)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(type(action_ind))
    assert action_ind == 0

    action_handler.get_action([1, 0, 0, 0])  # just make sure random doesn't fail
def test_get_random(action_handler: ActionHandler):
    # reset curr rand val
    action_handler.curr_rand_val = 1

    # should be a random action
    random, action = action_handler.get_random()
    assert random is True
    assert action in [0, 2, 4, 6]

    # shouldn't be a random action
    action_handler.curr_rand_val = 0
    random, action = action_handler.get_random()
    assert random is False
    assert action is None
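# A minimal sketch of the epsilon-greedy coin flip the test above exercises.
# Illustration only, not the library's implementation: it assumes the handler
# exposes `curr_rand_val` and a `legal_actions` sequence (that attribute name
# is an assumption). With probability curr_rand_val it returns
# (True, random_legal_action); otherwise (False, None).
def _get_random_sketch(handler):
    if np.random.uniform() < handler.curr_rand_val:
        return True, np.random.choice(handler.legal_actions)
    return False, None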
class AsyncProcessA3CLearner(AsyncProcessClient):
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe,
                 skip_frame=4, phi_length=4, async_update_step=5):
        super().__init__(pipe)

        # A3C doesn't have an E-greedy exploration policy so we set the random values to 0
        self.action_handler = ActionHandler((0, 0, 2))

        # initialize network
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)

        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()
        self.async_update_step = async_update_step

    def add_state_to_buffer(self, state):
        # shift the buffer left one frame and append the new state at the end
        self.frame_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        self.frame_buffer[0, self.phi_length - 1] = state

    def frame_buffer_with(self, state):
        # return a copy of the buffer as if `state` had been added, without mutating it
        empty_buffer = np.zeros((1, self.phi_length, 84, 84), dtype=np.float32)
        empty_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        empty_buffer[0, self.phi_length - 1] = state
        return empty_buffer

    def get_action(self, frame_buffer):
        return self.cnn.get_policy_output(frame_buffer)[0]

    def get_game_action(self, frame_buffer):
        action = self.get_action(frame_buffer)
        return self.action_handler.action_vect_to_game_action(action, random=False)

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
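# Quick illustration of the rolling frame buffer logic above, using tiny 2x2
# "frames" instead of 84x84 screens (illustrative only): each add drops the
# oldest of the phi_length frames and appends the newest, so the network always
# sees the last phi_length observations.
buf = np.zeros((1, 4, 2, 2), dtype=np.float32)
for i in range(5):
    frame = np.full((2, 2), i, dtype=np.float32)
    buf[0, 0:3] = buf[0, 1:4]  # shift left: drop the oldest frame
    buf[0, 3] = frame          # append the newest frame
# buf now holds frames 1, 2, 3, 4 -- the last four observations
assert (buf[0, :, 0, 0] == [1, 2, 3, 4]).all()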
def test_anneal_to(action_handler: ActionHandler):
    # zero should be highest rand val
    action_handler.anneal_to(0)
    assert action_handler.curr_rand_val == 1

    # one should be in the middle
    action_handler.anneal_to(1)
    assert action_handler.curr_rand_val == 0.5

    # 2 and greater should be lowest
    action_handler.anneal_to(2)
    assert action_handler.curr_rand_val == 0
    assert action_handler.curr_rand_val == action_handler.lowest_rand_val

    action_handler.anneal_to(999)
    assert action_handler.curr_rand_val == 0
    assert action_handler.curr_rand_val == action_handler.lowest_rand_val
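# A linear annealing schedule consistent with the assertions above for the
# fixture's rand_vals = (1, 0, 3): the per-step rate is (start - end) / (steps - 1)
# and the value is clipped at the lowest rand val. This is a sketch of the
# expected behaviour, not necessarily how ActionHandler implements it.
def _anneal_to_sketch(step, start=1.0, end=0.0, steps=3):
    rate = (start - end) / (steps - 1)
    return max(end, start - rate * step)

assert _anneal_to_sketch(0) == 1.0
assert _anneal_to_sketch(1) == 0.5
assert _anneal_to_sketch(2) == 0.0
assert _anneal_to_sketch(999) == 0.0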
class BaseThreadLearner(threading.Thread):
    def __init__(self, environment, network, global_dict, phi_length=4, async_update_step=5,
                 reward_clip_vals=[-1, 1], random_policy=True, epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=1000000, global_epsilon_annealing=True, testing=False):
        super().__init__()

        # If doing a random (E-greedy) policy
        self.random_policy = random_policy
        if random_policy:
            # initialize action handler; the ending E-greedy value is 0.1, 0.01 or 0.5
            # with probability 0.4, 0.3, 0.3 respectively
            end_rand = np.random.choice(epsilon_annealing_choices, p=epsilon_annealing_probabilities)
            rand_vals = (epsilon_annealing_start, end_rand, epsilon_annealing_steps)
            self.action_handler = ActionHandler(environment.get_num_actions(), rand_vals)

        self.step_count = 0
        self.environment = environment
        self.reward_clip_vals = reward_clip_vals

        # network stuff
        self.network = network
        self.phi_length = phi_length
        self.frame_buffer = FrameBuffer([1, phi_length] + environment.get_state_shape())
        self.async_update_step = async_update_step

        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing

        self.minibatch_vars = {}
        self.reset_minibatch()

        self.testing = testing

    def reset(self):
        self.reset_minibatch()
        self.frame_buffer.reset()

        # initialize the buffer with states
        # TODO: add random starts here
        state = self.environment.get_state()
        for _ in range(self.phi_length):
            self.frame_buffer.add_state_to_buffer(state)

    def run(self):
        while not self.global_dict['done']:
            reward = self.run_episode(self.environment)
            self.global_dict['add_reward'](reward)

            curr_rand_val = ''
            if self.random_policy:
                curr_rand_val = 'Curr Rand Val: {0}'.format(self.action_handler.curr_rand_val)
            print(self, 'Episode reward:', reward, 'Steps:', self.environment.curr_step_count,
                  'Step count:', self.step_count, curr_rand_val)

    def update(self, *args, **kwargs):
        raise NotImplementedError('Base onestep learner does not implement update.')

    def anneal_random_policy(self):
        if self.random_policy:
            # anneal action handler, using the global counter if annealing globally
            anneal_step = self.global_dict['counter'] if self.global_epsilon_annealing else self.step_count
            self.action_handler.anneal_to(anneal_step)

    def get_action(self, state):
        """
        Gets an action for the current state. First queries action_handler to see
        if we should execute a random action. If the action is random, don't send
        the state to the GPU.
        """
        if self.random_policy:
            # check if doing a random action
            random, action = self.action_handler.get_random()
            if not random:
                return self.network.get_output(self.frame_buffer.get_buffer_with(state))
            return action
        else:
            return self.network.get_output(self.frame_buffer.get_buffer_with(state))

    def reset_minibatch(self):
        pass
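# A minimal sketch of the shared `global_dict` that BaseThreadLearner expects.
# The keys ('done', 'counter', 'add_reward') are the ones the class actually
# reads above; the reward bookkeeping shown here is a hypothetical stand-in.
rewards = []
global_dict = {
    'done': False,                 # flip to True to stop all learner threads
    'counter': 0,                  # global step counter shared across threads
    'add_reward': rewards.append,  # called with each finished episode's reward
}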
def test_game_action_to_action_ind(action_handler: ActionHandler):
    action_ind = action_handler.game_action_to_action_ind(2)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(type(action_ind))
    assert action_ind == 1
class DQNLearner(BaseQLearner):
    def __init__(self, learner_parms, network, batch_size=32, testing=False):
        # set required parameters
        learner_parms.required(['skip_frame', 'egreedy_policy', 'dataset_shape', 'max_dataset_size',
                                'phi_length', 'minimum_replay_size', 'minibatch_size'])

        # initialize action handler; starting at 1, anneal the E-greedy policy to 0.1.
        # num actions starts at 0 and is set later via set_action_num
        rand_vals = learner_parms.get('egreedy_policy')
        self.action_handler = ActionHandler(0, rand_vals)

        # set cnn to the passed-in network
        self.cnn = network

        # initialize experience replay
        dataset_shape = learner_parms.get('dataset_shape')
        self.exp_replay = DataSet(dataset_shape['width'], dataset_shape['height'],
                                  max_steps=learner_parms.get('max_dataset_size'),
                                  phi_length=learner_parms.get('phi_length'))
        self.minimum_replay_size = learner_parms.get('minimum_replay_size')

        # initialize other vars
        self.batch_size = batch_size
        self.skip_frame = learner_parms.get('skip_frame')
        self.step_count = 0
        self.testing = testing

    def run_epoch(self, environment, epoch_step_count=50000):
        episode_rewards = list()
        start_step_count = self.step_count
        while (self.step_count - start_step_count) < epoch_step_count:
            episode_rewards.append(self.run_episode(environment))
        return episode_rewards

    def get_action(self, state):
        """
        Gets an action for the current state. First queries action_handler to see
        if we should execute a random action. If the action is random, don't send
        the state to the GPU.
        """
        # check if doing a random action
        random, action = self.action_handler.get_random()
        if not random:
            # NOTE: the reshape assumes skip_frame == phi_length (both default to 4)
            cnn_action_values = self.get_action_values(
                self.exp_replay.phi(state).reshape((1, self.skip_frame, 84, 84)))
            # we already checked for a random action above
            return self.action_handler.get_action(cnn_action_values, random=False)
        return action

    def update(self, state, action, reward, state_tp1, terminal):
        """
        Adds the experience to replay memory, runs a minibatch and anneals the random policy
        """
        reward = np.clip(reward, -1, 1)
        self.exp_replay.add_sample(state, action, reward, terminal)

        if not self.testing:
            self.run_minibatch()

        # anneal action handler
        self.step_count += 1
        self.action_handler.anneal_to(self.step_count)

    def run_minibatch(self):
        # generate minibatch data once the replay memory is big enough
        if self.exp_replay.size > self.minimum_replay_size:
            states, actions, rewards, state_tp1s, terminal = self.exp_replay.random_batch(self.batch_size)
            self.cnn.train(states, actions, rewards, state_tp1s, terminal)

    def get_action_values(self, processed_screens):
        return self.cnn.get_output(processed_screens)

    def get_status(self):
        return 'Step Count: {0}, Current Rand Val: {1}'.format(self.step_count,
                                                               self.action_handler.curr_rand_val)

    def set_action_num(self, num_actions):
        self.action_handler.num_actions = num_actions

    def save(self, filename):
        self.cnn.save(filename)
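# A hypothetical usage sketch for DQNLearner. ParamsStub is a stand-in for the
# real parameter container (only its required()/get() interface is inferred
# from the class above); the parameter values mirror the defaults used in these
# snippets (84x84 screens, phi_length 4, E-greedy annealed from 1 to 0.1).
class ParamsStub:
    def __init__(self, **kwargs):
        self._p = kwargs

    def required(self, keys):
        missing = [k for k in keys if k not in self._p]
        assert not missing, 'missing required parameters: {}'.format(missing)

    def get(self, key):
        return self._p[key]


parms = ParamsStub(skip_frame=4, egreedy_policy=(1, 0.1, 1000000),
                   dataset_shape={'width': 84, 'height': 84},
                   max_dataset_size=1000000, phi_length=4,
                   minimum_replay_size=100, minibatch_size=32)
# learner = DQNLearner(parms, network)  # network: any object with train/get_output/save
# learner.set_action_num(4)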
def test_action_vect_to_game_action(action_handler: ActionHandler):
    game_action = action_handler.action_vect_to_game_action([0, 0, 1, 0], random=False)
    assert isinstance(game_action, np.integer), "expected int got {}".format(type(game_action))
    assert game_action == 4
def test_rand_vals():
    # just test to make sure rand vals doesn't fail
    action_handler = ActionHandler((1, 0.1, 2), ActionPolicy.randVals, [0, 2, 4, 6])
    action_handler.get_action([0, 0, 0, 0])
def test_anneal(action_handler: ActionHandler):
    action_handler.anneal()
    action_handler.anneal()
    action_handler.anneal()
    assert action_handler.curr_rand_val == 0
    assert action_handler.curr_rand_val == action_handler.lowest_rand_val
@pytest.fixture
def action_handler():
    # rand vals: start at 1, anneal to 0 over 3 steps
    act = ActionHandler((1, 0, 3))
    return act