def _run_trial_explore(self, env, trials, current_trial) -> TrialMetrics:
    logger.debug("** Running trial explore **")

    # Initial conditions
    steps = 0
    raw_state = env.reset()
    state = self.cfg.environment_adapter.to_genotype(raw_state)
    action = env.action_space.sample()
    last_reward = 0
    prev_state = Perception.empty()
    selected_cl = None
    prev_selected_cl = None
    done = False

    while not done:
        state = Perception(state)
        match_set = self.population.form_match_set(state)

        if steps > 0:
            alp.apply(prev_state, state, selected_cl, self.population)
            rl.bucket_brigade_update(
                selected_cl, prev_selected_cl, last_reward)

        prev_selected_cl = selected_cl

        # TODO: this could be done better
        if random.random() < self.cfg.epsilon:
            selected_cl = random.choice(match_set)
        else:
            selected_cl = self._best_cl(match_set)

        action = selected_cl.action
        iaction = self.cfg.environment_adapter.to_lcs_action(action)
        logger.debug("\tExecuting action: [%d]", action)

        prev_state = Perception(state)
        raw_state, last_reward, done, _ = env.step(iaction)
        state = self.cfg.environment_adapter.to_genotype(raw_state)
        state = Perception(state)

        if done:
            alp.apply(prev_state, state, selected_cl, self.population)
            rl.bucket_brigade_update(
                selected_cl, prev_selected_cl, last_reward)

        steps += 1

    return TrialMetrics(steps, last_reward)
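# The credit-assignment step above delegates to rl.bucket_brigade_update.
# Below is a minimal sketch of a classic bucket-brigade-style rule that is
# consistent with the tests that follow: only the previous classifier's
# strength `r` is adjusted, the current classifier is read but not
# modified, and the `ir` field is untouched. BETA and GAMMA are assumed
# hyper-parameter names, and the exact update formula is illustrative,
# not taken from this codebase.
BETA = 0.05   # assumed learning rate
GAMMA = 0.95  # assumed discount factor

def bucket_brigade_update(cl, prev_cl, reward) -> None:
    # On the first step of a trial there is no predecessor to pay.
    if prev_cl is None:
        return
    # Pass a share of the current classifier's strength, plus the
    # environmental reward, back to the classifier that fired before it.
    prev_cl.r += BETA * (reward + GAMMA * cl.r - prev_cl.r)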
def test_should_perform_bucket_brigade_update_when_first_step(self):
    # given
    prev_cl = None
    cl = Classifier(0.5, None)

    # when
    rl.bucket_brigade_update(cl, prev_cl, 100)

    # then
    assert cl.r == 0.5
    assert prev_cl is None
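# For context, a minimal stand-in for the Classifier used in these tests,
# assuming the two positional constructor arguments are the strength `r`
# and the immediate-reward estimate `ir`. The field names are inferred
# from the assertions; everything else about the real class is assumed.
class Classifier:
    def __init__(self, r, ir):
        self.r = r    # classifier strength / payoff prediction
        self.ir = ir  # immediate reward estimate, unused by the update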
def test_should_perform_bucket_brigade_update(self, _r0, reward, _r1):
    # given
    prev_cl = Classifier(_r0, None)
    cl = Classifier(0.5, None)

    # when
    rl.bucket_brigade_update(cl, prev_cl, reward)

    # then
    assert cl.r == 0.5
    assert prev_cl.r == _r1
    assert cl.ir is None
    assert prev_cl.ir is None
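# The argument names (_r0, reward, _r1) suggest this test is parametrized.
# A hypothetical pytest parametrization that would sit directly above the
# test method; the expected strengths _r1 are illustrative only, computed
# from the sketch update rule above (BETA = 0.05, GAMMA = 0.95), not from
# the library's actual formula.
import pytest

@pytest.mark.parametrize("_r0, reward, _r1", [
    (0.5, 0,   0.5 + 0.05 * (0 + 0.95 * 0.5 - 0.5)),    # no reward
    (0.5, 100, 0.5 + 0.05 * (100 + 0.95 * 0.5 - 0.5)),  # terminal reward
])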