def _get_feedback_inner(self, state, action, reward, next_state, finished):
    # flatten the 2D board states into row vectors for the dense network
    state = state.reshape(1, multiply(*state.shape))
    next_state = next_state.reshape(1, multiply(*next_state.shape))
    # convert the multi-dimensional action index into a single flat action number
    action_number = np.unravel_index(
        np.ravel_multi_index(action, self.action_shape), (4096, ))[0]
    self.exp_buffer.add(state, action_number, reward, next_state, finished)
    if self.number_turns % self._intervall_actions_train == 0 and self.number_turns > 1:
        self.train_network()
    if self.number_turns % self._intervall_turns_load == 0 and self.number_turns > 1:
        self.load_weights_into_target_network()
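For reference, the nested NumPy calls simply turn the multi-dimensional action index into one flat action number out of the 4096 possibilities. A minimal sketch, assuming the action shape is (8, 8, 8, 8) (from-row, from-column, to-row, to-column); the concrete values are only illustrative:

import numpy as np

action_shape = (8, 8, 8, 8)   # assumed layout of self.action_shape
action = (1, 4, 3, 4)         # hypothetical move index

flat = np.ravel_multi_index(action, action_shape)   # flat index in [0, 4095]
# unravelling into the one-dimensional shape (4096,) gives the same number back,
# so the trailing [0] simply extracts that flat index
action_number = np.unravel_index(flat, (4096,))[0]
assert action_number == flat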
def _configure_network(self, state_shape: tuple):
    # simple feed-forward Q-network on the flattened state
    network = tf.keras.models.Sequential([
        Dense(512, activation="relu", input_shape=(multiply(*state_shape), )),
        # Dense(1024, activation="relu"),
        # Dense(2048, activation="relu"),
        # Dense(4096, activation="relu"),
        Dense(2048, activation="relu"),
        Dense(self.number_actions, activation="linear")])
    self.optimizer = tf.optimizers.Adam(self._learning_rate)
    return network
def decision(self, state_space: np.ndarray, action_space: ActionSpace):
    """
    Triggered by the get_play_turn method of the super class.
    This is the method where the magic happens: it chooses the action for the current state.
    :param state_space: current board state
    :param action_space: actions that are legal in the current state
    :return: the chosen action
    """
    # preprocess the state space: normalize it between zero and one
    state_space = min_max_scaling(state_space)
    state_space = state_space.reshape(1, multiply(*state_space.shape))
    qvalues = self._get_qvalues([state_space])
    decision = self._sample_actions(qvalues, action_space)
    return decision
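min_max_scaling is a small preprocessing helper from this project. A minimal sketch of what such a function typically does (the exact implementation may differ), mapping all board values into the range [0, 1]:

import numpy as np

def min_max_scaling(state_space: np.ndarray) -> np.ndarray:
    # map the smallest value to 0 and the largest to 1; guard against a constant board
    minimum, maximum = state_space.min(), state_space.max()
    if maximum == minimum:
        return np.zeros_like(state_space, dtype=float)
    return (state_space - minimum) / (maximum - minimum)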
def _get_feedback_inner(self, state, action, reward, next_state, finished):
    # flatten the 2D board states into vectors
    state = state.reshape(multiply(*state.shape), )
    next_state = next_state.reshape(multiply(*next_state.shape), )
    # convert the multi-dimensional action index into a single flat action number
    action_number = np.unravel_index(np.ravel_multi_index(action, self.action_shape), (4096,))[0]
    if self._buffer_action is not None:
        # store the previous transition together with the action chosen now (SARSA-style tuple)
        action_number_buffer = np.unravel_index(
            np.ravel_multi_index(self._buffer_action, self.action_shape), (4096,))[0]
        self.exp_buffer.add(self._buffer_state, action_number_buffer, self._buffer_reward,
                            state, finished, action_number)
    if self.number_turns % self._intervall_actions_train == 0 and self.number_turns > 1:
        self.train_network()
    if self.number_turns % self._intervall_turns_load == 0 and self.number_turns > 1:
        self.load_weights_into_target_network()
    self._buffer_action = action
    self._buffer_state = state
    self._buffer_reward = reward
    self._buffer_done = finished
    # if finished, reset the buffers to None since we don't want to mix episodes with each other
    if finished:
        # zero as next action number is just a placeholder (we don't use the next action since the episode is finished)
        self.exp_buffer.add(state, action_number, reward, next_state, finished, 0)
        self._buffer_done, self._buffer_reward, self._buffer_state, self._buffer_action = None, None, None, None
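The extra action number stored with each transition allows a SARSA-style target during training. A rough sketch of how such a target could be computed from a sampled batch; self.target_network and gamma are assumptions here, and this is an illustration rather than the project's actual train_network implementation:

# hypothetical batch sampled from the experience buffer:
# states, action_numbers, rewards, next_states, finished_flags, next_action_numbers
q_next = self.target_network(next_states)                                      # Q-values for the next states
q_next_taken = tf.gather(q_next, next_action_numbers, batch_dims=1, axis=1)    # Q(s', a')
# no bootstrapping on terminal transitions
targets = rewards + gamma * q_next_taken * (1.0 - finished_flags)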
def _configure_network(self, state_shape: tuple):
    # recurrent Q-network: the flattened state is fed as a sequence of length one
    network = tf.keras.models.Sequential([
        LSTM(512, activation="relu", input_shape=(1, multiply(*state_shape)), return_sequences=True),
        # LSTM(1024, activation="relu", return_sequences=True),
        # LSTM(2048, activation="relu", return_sequences=True),
        # LSTM(4096, activation="relu", return_sequences=True),
        # Dense(8012, activation="relu"),
        # Dense(4096, activation="relu"),
        Dense(2048, activation="relu"),
        Flatten(),
        Dense(self.number_actions, activation="linear")
    ])
    self.optimizer = tf.optimizers.Adam(self._learning_rate)
    return network
def _configure_network(self, state_shape: tuple):
    # define network with two heads: action logits and a scalar state value
    inputs = Input(shape=(1, multiply(*state_shape)))
    x = LSTM(512, activation="relu", return_sequences=True)(inputs)
    # x = LSTM(1024, activation="relu", return_sequences=True)(x)
    # x = LSTM(2048, activation="relu", return_sequences=True)(x)
    # x = Dense(4096, activation="relu")(x)
    x = Dense(2048, activation="relu")(x)
    x = Flatten()(x)
    logits = Dense(self.number_actions, activation="linear")(x)
    state_value = Dense(1, activation="linear")(x)
    network = tf.keras.models.Model(inputs=inputs, outputs=[logits, state_value])
    self.optimizer = tf.optimizers.Adam(self._learning_rate)
    return network
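The two output heads make this the network of an actor-critic style agent: the logits parameterize the policy and state_value estimates V(s). A minimal sketch of how the heads might be turned into action probabilities and an advantage estimate; the variable names (state_batch, returns) are only illustrative and not part of the project's code:

logits, state_value = network(state_batch)        # state_batch shape: (batch, 1, features)
probabilities = tf.nn.softmax(logits, axis=-1)    # policy over all actions
# advantage of the observed returns over the critic's estimate
advantage = returns - tf.squeeze(state_value, axis=-1)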