# NOTE: the methods below are excerpted from a larger agent class; they assume
# imports along these lines (the module paths are assumptions, not shown in
# this excerpt):
#   import numpy as np
#   import pprint
#   from lib import config as C, utils as U
    def play_right_add(self, verbose=False):
        # Note: this is the "right" version of game play, which also records
        # the input (state) and the action taken.
        prev_state = None
        prev_action = None
        prev_value = None
        prev_add_state = None
        prev_map_state = None
        show_image = False

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        self.dummy_add_state = np.zeros(1)
        self.dummy_map_state = np.zeros([1, 1, 1])

        simulate_seconds = 0
        feature_dict = U.edge_state()
        previous_match = -1

        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                show_image = (self.image_debug and verbose and
                              self.step % C.time_wait(self.image_wait_secs) == 0)

                if verbose:
                    print('show_image:', show_image)
                map_state = U.get_small_simple_map_data(
                    self.obs, show_image, show_image)

                if verbose:
                    print('map_state.shape:', map_state.shape)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                # get the action and value according to the state
                #print("add_state:", add_state)
                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state = np.zeros([1, 1, 1])
                action, action_probs, value = self.net.policy.get_act_action_probs(
                    state, add_state, map_state, verbose=verbose)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)

                self.mini_step(action)
                simulate_seconds += self.policy_wait_secs
                # the env steps forward to the new state

                prev_state = state
                prev_action = action
                prev_value = value
                prev_add_state = add_state
                prev_map_state = map_state

                self.policy_flag = False

            if self.is_end:
                # get the last state and reward
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))
                map_state = U.get_small_simple_map_data(self.obs)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state = np.zeros([1, 1, 1])

                value = self.net.policy.get_values(state, add_state, map_state)
                # the value of the last state is defined somewhat differently
                value = self.get_values_right(value)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)
                break

        if self.rl_training:
            #print(self.local_buffer.values)
            #print(self.local_buffer.values_next)
            #print(self.local_buffer.rewards)
            self.global_buffer.add(self.local_buffer)
            print("add map bn:")
            print("add %d buffer!" % (len(self.local_buffer.rewards)))
Example #2
    def set_flag(self):
        if self.step % C.time_wait(self.strategy_wait_secs) == 1:
            self.strategy_flag = True

        if self.step % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True
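
The two flags above gate how often the strategy and policy networks are
queried: `set_flag` is expected to run once per environment step, so each
flag fires once per wait window. A minimal sketch of that cadence, assuming
`C.time_wait(secs)` converts seconds into a step count (an assumption, not
shown in this excerpt):

    # Hypothetical: time_wait(secs) == secs * steps_per_second.
    def time_wait(secs, steps_per_second=4):
        return secs * steps_per_second

    policy_wait_secs, strategy_wait_secs = 2, 4
    policy_fires = sum(1 for step in range(100)
                       if step % time_wait(policy_wait_secs) == 1)
    strategy_fires = sum(1 for step in range(100)
                         if step % time_wait(strategy_wait_secs) == 1)
    print(policy_fires, strategy_fires)  # 13 and 7: the policy flag fires about twice as often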
Example #3
    def play_right_add(self, verbose=False):
        # Note: this is the "right" version of game play, which also records
        # the input (state) and the action taken.
        prev_state = None
        prev_action = None
        prev_value = None
        prev_add_state = None
        prev_map_state = None
        show_image = False
        # defaults for the debug plots; without these, the edge_state_debug
        # branch below could raise a NameError when action_prob_debug is off
        color = 'b'
        max_y = 0.3

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        self.dummy_add_state = np.zeros(1)
        self.dummy_map_state = np.zeros([1, 1, 1])

        simulate_seconds = 0
        feature_dict = U.edge_state()
        previous_match = -1

        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                show_image = (self.image_debug and verbose and
                              self.step % C.time_wait(self.image_wait_secs) == 0)

                if verbose:
                    print('show_image:', show_image)
                map_state = U.get_small_simple_map_data(
                    self.obs, show_image, show_image)

                if verbose:
                    print('map_state.shape:', map_state.shape)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                # get the action and value according to the state
                #print("add_state:", add_state)
                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state = np.zeros([1, 1, 1])
                action, action_probs, value = self.net.policy.get_act_action_probs(
                    state, add_state, map_state, verbose=verbose)

                # note: the "prove" spelling follows the attribute name in the
                # source; save_prob is computed here but not consumed in this excerpt
                save_prob = (self.probe_debug and verbose and
                             self.step % C.time_wait(self.prove_save_wait_secs) == 0)

                if self.action_prob_debug and verbose:
                    print('self.step:', self.step)
                    print(self.prob_show_wait_seconds)
                    print('simulate_seconds:', simulate_seconds)

                    use_TG = True
                    if use_TG:
                        bar_type = 'TG'
                        max_y = 0.3
                        color = 'b'
                    else:
                        bar_type = 'RG'
                        max_y = 0.5
                        color = 'r'

                    bar_name = bar_type + '_' + str(simulate_seconds)
                    if simulate_seconds in self.prob_show_wait_seconds:
                        pprint.pprint(action_probs)
                        U.show_prob_dist(action_probs,
                                         show=True,
                                         color=color,
                                         max_y=max_y,
                                         action_num=self.action_num,
                                         save=False,
                                         name=bar_name,
                                         count=0)

                if self.edge_state_debug and verbose:
                    match_list = U.calculate_state_mapping(state, feature_dict)
                    print('state:', state)
                    print('match_list:', match_list)

                    for i, match in enumerate(match_list):
                        if match and i != previous_match:
                            print('Match:', i + 1)
                            match_name = 'ES' + '_' + str(i + 1) + '_' + str(
                                simulate_seconds)
                            U.show_prob_dist(action_probs,
                                             show=True,
                                             color=color,
                                             max_y=max_y,
                                             action_num=self.action_num,
                                             save=True,
                                             name=match_name,
                                             count=0)
                            previous_match = i

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)

                self.mini_step(action)
                simulate_seconds += self.policy_wait_secs
                # the env steps forward to the new state

                prev_state = state
                prev_action = action
                prev_value = value
                prev_add_state = add_state
                prev_map_state = map_state

                self.policy_flag = False

            if self.is_end:
                # get the last state and reward
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))
                map_state = U.get_small_simple_map_data(self.obs)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state = np.zeros([1, 1, 1])

                value = self.net.policy.get_values(state, add_state, map_state)
                # the value of the last state is defined somewhat differently
                value = self.get_values_right(value)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)
                break

        if self.rl_training:
            #print(self.local_buffer.values)
            #print(self.local_buffer.values_next)
            #print(self.local_buffer.rewards)
            self.global_buffer.add(self.local_buffer)
            print("add map bn:")
            print("add %d buffer!" % (len(self.local_buffer.rewards)))