def init_table(self) -> dict:
    """Build the nested reward-statistics lookup table.

    The result is a three-level dict indexed as
    ``table[last_observation][action][observation]``; every leaf is a
    fresh value from ``Utility.init_heap(action_length)``.

    Keys are whatever ``Utility.get_bitstring_from_decimal(index, length)``
    returns for each index in ``range(2 ** length)`` (presumably
    fixed-width bit strings -- confirm against ``Utility``).

    When the environment's observation length is 0, both observation
    levels use the empty string ``''`` as their only key.

    Returns:
        The populated nested dict.
    """
    a_length = self.environment.action_length
    o_length = self.environment.observation_length
    table = {}

    # Without observation bits, both observation levels collapse to the
    # single key ''.
    if o_length == 0:
        per_action = {}
        table[''] = per_action
        for act_idx in range(2 ** a_length):
            act = Utility.get_bitstring_from_decimal(act_idx, a_length)
            per_action[act] = {'': Utility.init_heap(a_length)}
        return table

    for prev_idx in range(2 ** o_length):
        prev_obs = Utility.get_bitstring_from_decimal(prev_idx, o_length)
        per_action = {}
        table[prev_obs] = per_action
        for act_idx in range(2 ** a_length):
            act = Utility.get_bitstring_from_decimal(act_idx, a_length)
            per_observation = {}
            per_action[act] = per_observation
            for obs_idx in range(2 ** o_length):
                obs = Utility.get_bitstring_from_decimal(obs_idx, o_length)
                # Each (last observation, action, observation) triple gets
                # its own independently initialised heap.
                per_observation[obs] = Utility.init_heap(a_length)
    return table
def init_double_heap(length: int):
    """Create the initial heap over all ordered pairs of actions.

    Used by the pi2forward agent. One entry is pushed for every ordered
    pair of action strings, where each action string comes from
    ``Utility.get_bitstring_from_decimal(index, length)`` for an index in
    ``range(2 ** length)``.

    Args:
        length: Length passed to the bitstring helper for a single action.

    Returns:
        A list maintained with ``heapq`` whose entries are tuples
        ``(expected_reward, actions, number_of_times)``. Each entry
        starts as ``(2, first_action + second_action, 0)``.
    """
    heap = []
    pair_range = 2 ** length
    for first_idx in range(pair_range):
        first = Utility.get_bitstring_from_decimal(first_idx, length)
        for second_idx in range(pair_range):
            second = Utility.get_bitstring_from_decimal(second_idx, length)
            # Entry layout: (expected_reward, actions, number_of_times)
            entry = (2, first + second, 0)
            heapq.heappush(heap, entry)
    return heap