Example #1
import math

import cv2
import numpy as np

# Interpolator, ACTIONS and DISCOUNT are project-level imports/constants
# assumed to be available in this module.


def get_x_y(data_list):
    interpolator = Interpolator()
    interpolator.set_u(ACTIONS)
    x = []
    y = []
    for data_row in data_list:
        # Bellman target: reward plus the discounted best future quality.
        new_q = data_row["reward"]
        if not data_row["done"]:
            new_q += DISCOUNT * np.max(data_row["next_qualities"])
        interpolator.set_q(data_row["qualities"])
        interpolator.update_function(data_row["action"], new_q)
        x.append(data_row["state"])
        y.append(interpolator.get_q())
    return x, y
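
# A minimal sketch of the transition rows get_x_y() expects; the dict keys
# mirror those read above, while the concrete shapes and values here are
# hypothetical placeholders.
# example_rows = [{
#     "state": np.zeros(8),
#     "action": ACTIONS[0],
#     "reward": 1.0,
#     "done": False,
#     "qualities": np.zeros(len(ACTIONS)),
#     "next_qualities": np.zeros(len(ACTIONS)),
# }]
# x_train, y_train = get_x_y(example_rows)
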
class OutputVisualizer:
    def __init__(self, window_name="output"):
        self.WIDTH = 250

        self.DISPLAY_STEERING_MIN = -1
        self.DISPLAY_THROTTLE_MIN = -1
        self.DISPLAY_STEERING_MAX = 1
        self.DISPLAY_THROTTLE_MAX = 1

        self.DISPLAY_STEERING_RANGE = self.DISPLAY_STEERING_MAX - self.DISPLAY_STEERING_MIN
        self.DISPLAY_THROTTLE_RANGE = self.DISPLAY_THROTTLE_MAX - self.DISPLAY_THROTTLE_MIN

        self.DISPLAY_STEP = 0.1
        self.DISPLAY_STEERING_SEGMENT_WIDTH = int(
            self.WIDTH * self.DISPLAY_STEP / self.DISPLAY_STEERING_RANGE)
        self.DISPLAY_THROTTLE_SEGMENT_WIDTH = int(
            self.WIDTH * self.DISPLAY_STEP / self.DISPLAY_THROTTLE_RANGE)

        self.HUE_RANGE = 60

        self.img = None
        self.window_name = window_name
        self._clear()
        self.interpolator = Interpolator()

    def _clear(self):
        self.img = np.zeros((self.WIDTH, self.WIDTH, 3), np.uint8)

    def _iterate(self, output):
        # Sample the interpolated quality surface on a fixed [-1, 1] grid;
        # not used by render(), which calls _draw_output_2 instead.
        u = output[:, :2]
        q = output[:, 2]
        self.interpolator.set_u(u)
        self.interpolator.set_q(q)
        X = []
        Y = []
        Z = []
        for throttle in np.arange(-1, 1.1, 0.1):
            for steering in np.arange(-1, 1.1, 0.1):
                X.append(throttle)
                Y.append(steering)
                Z.append(
                    self.interpolator.get_quality(
                        np.array([throttle, steering])))
        return X, Y, Z

    def _coord2px(self, value):
        # Clamp a coordinate into the valid pixel range [0, WIDTH - 1].
        return int(min(max(value, 0), self.WIDTH - 1))

    def _draw_output_2(self, output):
        # Draw one flat-coloured cell per action knot (no interpolation).
        # Column 0 of u is throttle (vertical axis), column 1 is steering
        # (horizontal axis).
        u = output[:, :2]
        q = output[:, 2]

        x = [action[1] for action in u]
        y = [action[0] for action in u]

        x_0 = min(x)
        y_0 = min(y)

        x_pixels_per_value = self.WIDTH / (max(x) - min(x))
        y_pixels_per_value = self.WIDTH / (max(y) - min(y))

        x_loc = [
            self._coord2px((x_value - x_0) * x_pixels_per_value)
            for x_value in x
        ]
        y_loc = [
            self._coord2px(self.WIDTH - (y_value - y_0) * y_pixels_per_value)
            for y_value in y
        ]

        # Cell boundaries: each knot owns the span up to the midpoint between
        # it and its neighbour, so the rectangles tile the image exactly.
        x_values = sorted(set(x_loc))
        y_values = sorted(set(y_loc))

        x_start = dict(
            zip(x_values, [0] + [
                self._coord2px((x_values[i + 1] + x_values[i]) / 2)
                for i in range(len(x_values) - 1)
            ]))
        y_start = dict(
            zip(y_values, [0] + [
                self._coord2px((y_values[i + 1] + y_values[i]) / 2)
                for i in range(len(y_values) - 1)
            ]))

        x_stop = dict(
            zip(x_values, [
                self._coord2px((x_values[i + 1] + x_values[i]) / 2)
                for i in range(len(x_values) - 1)
            ] + [self.WIDTH - 1]))
        y_stop = dict(
            zip(y_values, [
                self._coord2px((y_values[i + 1] + y_values[i]) / 2)
                for i in range(len(y_values) - 1)
            ] + [self.WIDTH - 1]))

        # Anchor the hue scale at zero; q is an ndarray, so list concatenation
        # does not work here and the range must be computed explicitly.
        q_0 = min(np.min(q), 0)
        q_range = max(np.max(q), 0) - q_0
        hue_per_value = self.HUE_RANGE / q_range if q_range else 1

        for i in range(len(q)):
            cv2.rectangle(self.img, (x_start[x_loc[i]], y_start[y_loc[i]]),
                          (x_stop[x_loc[i]], y_stop[y_loc[i]]),
                          color=tuple(
                              map(
                                  int,
                                  cv2.cvtColor(
                                      np.uint8([[[(q[i] - q_0) * hue_per_value,
                                                  255, 255]]]),
                                      cv2.COLOR_HSV2BGR)[0, 0])),
                          thickness=-1)

            cv2.circle(self.img, (x_loc[i], y_loc[i]),
                       max(int(self.DISPLAY_THROTTLE_SEGMENT_WIDTH / 5), 1),
                       (0, 0, 0), -1)

    def _draw_output(self, output):
        # Interpolated variant: samples get_quality() on a regular grid and
        # draws one cell per sample (kept as an alternative to _draw_output_2).
        u = output[:, :2]
        q = output[:, 2]
        self.interpolator.set_u(u)
        self.interpolator.set_q(q)

        X = []
        Y = []
        Z = []

        for throttle in np.arange(
                self.DISPLAY_THROTTLE_MIN,
                self.DISPLAY_THROTTLE_MAX + self.DISPLAY_STEP,
                self.DISPLAY_STEP):
            y = (self.WIDTH - self.DISPLAY_THROTTLE_SEGMENT_WIDTH) * (
                0.5 - throttle / self.DISPLAY_THROTTLE_RANGE)
            for steering in np.arange(
                    self.DISPLAY_STEERING_MIN,
                    self.DISPLAY_STEERING_MAX + self.DISPLAY_STEP,
                    self.DISPLAY_STEP):
                x = (self.WIDTH - self.DISPLAY_STEERING_SEGMENT_WIDTH) * (
                    0.5 + steering / self.DISPLAY_STEERING_RANGE)

                X.append(int(x))
                Y.append(int(y))
                Z.append(
                    self.interpolator.get_quality(
                        np.array([throttle, steering])))

        knots = []
        for i in range(len(u)):
            throttle = u[i][0]
            steering = u[i][1]
            y = int(self.WIDTH *
                    (0.5 - throttle / self.DISPLAY_THROTTLE_RANGE))
            x = int(self.WIDTH *
                    (0.5 + steering / self.DISPLAY_STEERING_RANGE))
            knots.append((x, y))

        min_q = min(Z)
        range_q = max(Z) - min_q

        # Guard against a flat quality surface, where the range collapses to 0.
        q_multiplier = self.HUE_RANGE / range_q if range_q else 1
        if math.isnan(q_multiplier) or math.isinf(q_multiplier):
            q_multiplier = 1

        Z = [
            min(max(int(q_multiplier * (z - min_q)), 0), self.HUE_RANGE)
            for z in Z
        ]

        for i in range(len(Z)):
            cv2.rectangle(self.img, (int(min(
                self.WIDTH - 1, X[i])), int(min(self.WIDTH - 1, Y[i]))),
                          (int(
                              min(self.WIDTH - 1,
                                  X[i] + self.DISPLAY_STEERING_SEGMENT_WIDTH)),
                           int(
                               min(self.WIDTH - 1, Y[i] +
                                   self.DISPLAY_THROTTLE_SEGMENT_WIDTH))),
                          color=tuple(
                              map(
                                  int,
                                  cv2.cvtColor(np.uint8([[[Z[i], 255, 255]]]),
                                               cv2.COLOR_HSV2BGR)[0, 0])),
                          thickness=-1)

        for knot in knots:
            cv2.circle(self.img, knot,
                       max(int(self.DISPLAY_THROTTLE_SEGMENT_WIDTH / 5), 1),
                       (0, 0, 0), -1)

    def render(self, output):
        self._clear()
        self._draw_output_2(output)
        cv2.imshow(self.window_name, self.img)
        cv2.waitKey(40)
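
# A minimal, hypothetical usage sketch: `output` packs the (throttle, steering)
# action knots and their qualities into a single (N, 3) array, which render()
# slices into u = output[:, :2] and q = output[:, 2].
if __name__ == "__main__":
    knots = np.array([[t, s] for t in (-1.0, 0.0, 1.0)
                      for s in (-1.0, 0.0, 1.0)])
    output = np.column_stack([knots, np.random.rand(len(knots))])
    OutputVisualizer().render(output)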
Example #3
    # Assumes module-level numpy (np) plus the project constants used below:
    # MIN_REPLAY_MEMORY_SIZE, MINIBATCH_SIZE, DISCOUNT, ACTIONS,
    # OUTPUT_1D_SHAPE, OUTPUT_2D_SHAPE and UPDATE_TARGET_EVERY.
    def train(self, terminal_state):
        # Start training only once a certain number of samples has been saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Calculate Prioritized Experience Replay weights from absolute TD errors
        current_states = np.array([transition[0] for transition in self.replay_memory])
        future_states = np.array([transition[3] for transition in self.replay_memory])
        current_qs = self.model.predict(current_states)
        future_qs = self.target_model.predict(future_states)
        p = np.array([abs((reward + DISCOUNT * np.amax(future_qs[index]) if not done else reward)
                          - current_qs[index][ACTIONS.index(action)])
                      for index, (_, action, reward, _, done) in enumerate(self.replay_memory)])
        p = np.interp(p, (p.min(), p.max()), (0, +1))
        p /= np.sum(p)

        # Get a minibatch of random samples from memory replay table
        minibatch = np.array(self.replay_memory)[np.random.choice(len(self.replay_memory),
                                                                  size=MINIBATCH_SIZE,
                                                                  replace=False,
                                                                  p=p)]  # random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0] for transition in minibatch])  # / 255
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query both networks for Q values:
        # the main network selects the next action, the target network evaluates it (Double DQN)
        new_current_states = np.array([transition[3] for transition in minibatch])  # / 255
        future_target_qs_list = self.target_model.predict(new_current_states)
        future_model_qs_list = self.model.predict(new_current_states)

        x = []
        y = []
        interpolator = Interpolator()

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

            # If not a terminal state, bootstrap the target from future Q values;
            # otherwise the reward alone is the target
            future_model_qs_at_index = future_model_qs_list[index]
            future_target_qs_at_index = future_target_qs_list[index]
            # future_qs = np.reshape(future_model_qs_at_index, OUTPUT_2D_SHAPE)
            if not done:
                # Double DQN: the online model picks the action, the target model rates it
                max_future_q = future_target_qs_at_index[np.argmax(future_model_qs_at_index)]
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            current_qs_list_at_index = current_qs_list[index]
            current_qs = np.reshape(current_qs_list_at_index, OUTPUT_2D_SHAPE)
            current_actions = ACTIONS
            current_qualities = current_qs

            interpolator.set_u(current_actions)
            interpolator.set_q(current_qualities)
            interpolator.update_function(action, new_q)
            # Read back the updated qualities (instead of the plain DQN write
            # current_qs[action] = new_q, the interpolator spreads the update
            # over its knots)
            current_qs = interpolator.get_q()

            # And append to our training data
            x.append(current_state)
            reshaped_current_qs = np.reshape(current_qs, OUTPUT_1D_SHAPE)
            y.append(reshaped_current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(x), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False,
                       callbacks=[self.tensorboard] if terminal_state else None)
        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
            self.save_replay_memory()
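
# A self-contained sketch of the prioritized-sampling weights computed at the
# top of train(), using dummy TD errors in place of network predictions (pure
# numpy; the variable names here are illustrative only):
#
#     import numpy as np
#
#     td_errors = np.array([0.5, 2.0, 0.1, 1.2])  # |target Q - predicted Q|
#     p = np.interp(td_errors, (td_errors.min(), td_errors.max()), (0, 1))
#     p /= np.sum(p)  # normalize into sampling probabilities
#     picks = np.random.choice(len(td_errors), size=2, replace=False, p=p)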