import math

import cv2
import numpy as np

# Interpolator, ACTIONS, DISCOUNT, MIN_REPLAY_MEMORY_SIZE, MINIBATCH_SIZE,
# UPDATE_TARGET_EVERY, OUTPUT_1D_SHAPE and OUTPUT_2D_SHAPE are assumed to be
# defined elsewhere in this project.


def get_x_y(data_list):
    interpolator = Interpolator()
    interpolator.set_u(ACTIONS)
    x = []
    y = []
    for data_row in data_list:
        # Standard Q-learning target: the reward, plus the discounted best
        # future quality for non-terminal transitions.
        new_q = data_row["reward"]
        if not data_row["done"]:
            new_q += DISCOUNT * np.max(data_row["next_qualities"])
        interpolator.set_q(data_row["qualities"])
        interpolator.update_function(data_row["action"], new_q)
        x.append(data_row["state"])
        y.append(interpolator.get_q())
    return x, y
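
# A hedged usage sketch for get_x_y. The field names mirror the lookups in
# the function above; the concrete values are hypothetical:
#
#   data_list = [{
#       "state": np.array([...]),           # network input for this step
#       "action": np.array([0.5, 0.0]),     # action actually taken
#       "reward": 1.0,
#       "done": False,
#       "qualities": np.array([...]),       # Q at each ACTIONS entry (this state)
#       "next_qualities": np.array([...]),  # Q at each ACTIONS entry (next state)
#   }]
#   x, y = get_x_y(data_list)               # training inputs and targets
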
class OutputVisualizer:
    def __init__(self, window_name="output"):
        self.WIDTH = 250
        self.DISPLAY_STEERING_MIN = -1
        self.DISPLAY_THROTTLE_MIN = -1
        self.DISPLAY_STEERING_MAX = 1
        self.DISPLAY_THROTTLE_MAX = 1
        self.DISPLAY_STEERING_RANGE = (
            self.DISPLAY_STEERING_MAX - self.DISPLAY_STEERING_MIN)
        self.DISPLAY_THROTTLE_RANGE = (
            self.DISPLAY_THROTTLE_MAX - self.DISPLAY_THROTTLE_MIN)
        self.DISPLAY_STEP = 0.1
        self.DISPLAY_STEERING_SEGMENT_WIDTH = int(
            self.WIDTH * self.DISPLAY_STEP / self.DISPLAY_STEERING_RANGE)
        self.DISPLAY_THROTTLE_SEGMENT_WIDTH = int(
            self.WIDTH * self.DISPLAY_STEP / self.DISPLAY_THROTTLE_RANGE)
        self.HUE_RANGE = 60
        self.img = None
        self.window_name = window_name
        self._clear()
        self.interpolator = Interpolator()

    def _clear(self):
        self.img = np.zeros((self.WIDTH, self.WIDTH, 3), np.uint8)

    def _iterate(self, output):
        # Sample the interpolated quality surface on a fixed action grid.
        u = output[:, :2]
        q = output[:, 2]
        self.interpolator.set_u(u)
        self.interpolator.set_q(q)
        X = []
        Y = []
        Z = []
        for throttle in np.arange(-1, 1.1, 0.1):
            for steering in np.arange(-1, 1.1, 0.1):
                X.append(throttle)
                Y.append(steering)
                Z.append(self.interpolator.get_quality(
                    np.array([throttle, steering])))
        return X, Y, Z

    def _coord2px(self, value):
        return int(min(max(value, 0), self.WIDTH - 1))

    def _draw_output_2(self, output):
        u = output[:, :2]
        # Keep q as a plain list so that `q + [0]` below concatenates
        # instead of broadcasting an elementwise numpy addition.
        q = list(output[:, 2])
        x = [action[1] for action in u]
        y = [action[0] for action in u]
        x_0 = min(x)
        y_0 = min(y)
        # Assumes the action set spans more than one steering and more than
        # one throttle value.
        x_pixels_per_value = self.WIDTH / (max(x) - min(x))
        y_pixels_per_value = self.WIDTH / (max(y) - min(y))
        x_loc = [
            self._coord2px((x_value - x_0) * x_pixels_per_value)
            for x_value in x
        ]
        y_loc = [
            self._coord2px(self.WIDTH - (y_value - y_0) * y_pixels_per_value)
            for y_value in y
        ]
        x_values = sorted(set(x_loc))
        y_values = sorted(set(y_loc))
        # Each knot owns the cell that extends halfway to its neighbours.
        x_start = dict(
            zip(x_values, [0] + [
                self._coord2px((x_values[i + 1] + x_values[i]) / 2)
                for i in range(len(x_values) - 1)
            ]))
        y_start = dict(
            zip(y_values, [0] + [
                self._coord2px((y_values[i + 1] + y_values[i]) / 2)
                for i in range(len(y_values) - 1)
            ]))
        x_stop = dict(
            zip(x_values, [
                self._coord2px((x_values[i + 1] + x_values[i]) / 2)
                for i in range(len(x_values) - 1)
            ] + [self.WIDTH - 1]))
        y_stop = dict(
            zip(y_values, [
                self._coord2px((y_values[i + 1] + y_values[i]) / 2)
                for i in range(len(y_values) - 1)
            ] + [self.WIDTH - 1]))
        # Include 0 in the range so the hue scale stays anchored; guard
        # against a zero range when all qualities are equal to 0.
        q_0 = min(q + [0])
        q_range = max(q + [0]) - q_0
        hue_per_value = self.HUE_RANGE / q_range if q_range else 0
        for i in range(len(q)):
            hue = np.uint8([[[(q[i] - q_0) * hue_per_value, 255, 255]]])
            color = tuple(map(int, cv2.cvtColor(hue, cv2.COLOR_HSV2BGR)[0, 0]))
            cv2.rectangle(self.img,
                          (x_start[x_loc[i]], y_start[y_loc[i]]),
                          (x_stop[x_loc[i]], y_stop[y_loc[i]]),
                          color=color,
                          thickness=-1)
            cv2.circle(self.img, (x_loc[i], y_loc[i]),
                       max(int(self.DISPLAY_THROTTLE_SEGMENT_WIDTH / 5), 1),
                       (0, 0, 0), -1)

    def _draw_output(self, output):
        u = output[:, :2]
        q = output[:, 2]
        self.interpolator.set_u(u)
        self.interpolator.set_q(q)
        X = []
        Y = []
        Z = []
        for throttle in np.arange(
                self.DISPLAY_THROTTLE_MIN,
                self.DISPLAY_THROTTLE_MAX + self.DISPLAY_STEP,
                self.DISPLAY_STEP):
            y = (self.WIDTH - self.DISPLAY_THROTTLE_SEGMENT_WIDTH) * (
                0.5 - throttle / self.DISPLAY_THROTTLE_RANGE)
            for steering in np.arange(
                    self.DISPLAY_STEERING_MIN,
                    self.DISPLAY_STEERING_MAX + self.DISPLAY_STEP,
                    self.DISPLAY_STEP):
                x = (self.WIDTH - self.DISPLAY_STEERING_SEGMENT_WIDTH) * (
                    0.5 + steering / self.DISPLAY_STEERING_RANGE)
                X.append(int(x))
                Y.append(int(y))
                Z.append(self.interpolator.get_quality(
                    np.array([throttle, steering])))
        knots = []
        for i in range(len(u)):
            throttle = u[i][0]
            steering = u[i][1]
            y = int(self.WIDTH * (
                0.5 - throttle / self.DISPLAY_THROTTLE_RANGE))
            x = int(self.WIDTH * (
                0.5 + steering / self.DISPLAY_STEERING_RANGE))
            knots.append((x, y))
        # Normalize the sampled grid qualities into the displayed hue range.
        min_q = min(Z)
        range_q = max(Z) - min_q
        if range_q and math.isfinite(range_q):
            q_multiplier = self.HUE_RANGE / range_q
        else:
            q_multiplier = 1
        Z = [
            min(max(int(q_multiplier * (z - min_q)), 0), self.HUE_RANGE)
            for z in Z
        ]
        for i in range(len(Z)):
            hue = np.uint8([[[Z[i], 255, 255]]])
            color = tuple(map(int, cv2.cvtColor(hue, cv2.COLOR_HSV2BGR)[0, 0]))
            cv2.rectangle(
                self.img,
                (min(self.WIDTH - 1, X[i]),
                 min(self.WIDTH - 1, Y[i])),
                (min(self.WIDTH - 1,
                     X[i] + self.DISPLAY_STEERING_SEGMENT_WIDTH),
                 min(self.WIDTH - 1,
                     Y[i] + self.DISPLAY_THROTTLE_SEGMENT_WIDTH)),
                color=color,
                thickness=-1)
        for knot in knots:
            cv2.circle(self.img, knot,
                       max(int(self.DISPLAY_THROTTLE_SEGMENT_WIDTH / 5), 1),
                       (0, 0, 0), -1)

    def render(self, output):
        self._clear()
        self._draw_output_2(output)
        cv2.imshow(self.window_name, self.img)
        cv2.waitKey(40)
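
# A minimal rendering sketch, assuming each row of `output` is
# [throttle, steering, quality] as _draw_output_2 expects; the window name
# and the 3x3 action grid here are hypothetical:
#
#   actions = [(t, s) for t in (-1, 0, 1) for s in (-1, 0, 1)]
#   qualities = np.random.rand(len(actions))
#   output = np.array([[t, s, q] for (t, s), q in zip(actions, qualities)])
#   OutputVisualizer("q-values").render(output)
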
def train(self, terminal_state):
    # Start training only if a certain number of samples is already saved
    if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
        return

    # Calculate prioritized experience replay weights: each transition's
    # priority is the absolute TD error of the current model on it
    current_states = np.array(
        [transition[0] for transition in self.replay_memory])
    future_states = np.array(
        [transition[3] for transition in self.replay_memory])
    current_qs = self.model.predict(current_states)
    future_qs = self.target_model.predict(future_states)
    p = np.array([
        abs((reward + DISCOUNT * np.amax(future_qs[index])
             if not done else reward)
            - current_qs[index][ACTIONS.index(action)])
        for index, (_, action, reward, _, done)
        in enumerate(self.replay_memory)
    ])
    # Rescale the priorities to [0, 1], then normalize them into a
    # probability distribution
    p = np.interp(p, (p.min(), p.max()), (0, 1))
    p /= np.sum(p)

    # Get a minibatch of random samples from the replay memory, weighted
    # by the priorities computed above
    chosen = np.random.choice(
        len(self.replay_memory), size=MINIBATCH_SIZE, replace=False, p=p)
    minibatch = [self.replay_memory[i] for i in chosen]

    # Get current states from minibatch, then query NN model for Q values
    current_states = np.array([transition[0] for transition in minibatch])
    current_qs_list = self.model.predict(current_states)

    # Get future states from minibatch, then query both networks: the main
    # network chooses the best future action and the target network
    # evaluates it (Double DQN style)
    new_current_states = np.array(
        [transition[3] for transition in minibatch])
    future_target_qs_list = self.target_model.predict(new_current_states)
    future_model_qs_list = self.model.predict(new_current_states)

    x = []
    y = []
    interpolator = Interpolator()

    # Now we need to enumerate our batch
    for index, (current_state, action, reward, new_current_state,
                done) in enumerate(minibatch):
        # If not a terminal state, get new q from future states, otherwise
        # use the reward alone -- almost like with plain Q-learning, but we
        # use just part of the equation here
        if not done:
            max_future_q = future_target_qs_list[index][
                np.argmax(future_model_qs_list[index])]
            new_q = reward + DISCOUNT * max_future_q
        else:
            new_q = reward

        # Update the Q value for the given state by pushing the new target
        # through the interpolator over the fixed action set
        current_qs = np.reshape(current_qs_list[index], OUTPUT_2D_SHAPE)
        interpolator.set_u(ACTIONS)
        interpolator.set_q(current_qs)
        interpolator.update_function(action, new_q)
        current_qs = interpolator.get_q()

        # And append to our training data
        x.append(current_state)
        y.append(np.reshape(current_qs, OUTPUT_1D_SHAPE))

    # Fit on all samples as one batch, log only on terminal state
    self.model.fit(np.array(x), np.array(y), batch_size=MINIBATCH_SIZE,
                   verbose=0, shuffle=False,
                   callbacks=[self.tensorboard] if terminal_state else None)

    # Update target network counter every episode
    if terminal_state:
        self.target_update_counter += 1

        # If counter reaches set value, update target network with weights
        # of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
        self.save_replay_memory()
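
# A hedged call pattern for train(), assuming the surrounding agent loop
# stores transitions as (state, action, reward, next_state, done) tuples in
# self.replay_memory (the names below are hypothetical):
#
#   agent.replay_memory.append((state, action, reward, next_state, done))
#   agent.train(terminal_state=done)
#   if done:
#       state = env.reset()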