def play_right_add(self, verbose=False):
    """Play one episode with the "right" policy loop, recording RL transitions.

    While ``self.policy_flag`` is raised, builds the policy inputs from the
    current observation, queries ``self.net.policy`` for an action and a state
    value, steps the environment, and appends transition tuples
    (prev_state, prev_add_state, prev_map_state, prev_action, state, reward,
    prev_value, value) to ``self.local_buffer``.  When the episode ends, the
    terminal value is recomputed via ``get_values_right`` and, if
    ``self.rl_training`` is set, the local buffer is flushed into
    ``self.global_buffer``.

    NOTE(review): a second, longer definition of ``play_right_add`` appears
    later in this file and shadows this one at class-creation time — confirm
    which version is intended to be live.

    Args:
        verbose: when True, print per-step debugging information.
    """
    prev_state = None
    prev_action = None
    prev_value = None
    prev_add_state = None
    prev_map_state = None
    show_image = False

    # Warm up: issue a no-op, then center the camera on the base position.
    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    # Placeholder observations, used when the additional observation space
    # is disabled (self.ob_space_add == 0).
    self.dummy_add_state = np.zeros(1)
    self.dummy_map_state = np.zeros([1, 1, 1])

    # NOTE(review): removed unused locals ``feature_dict = U.edge_state()``
    # and ``previous_match = -1`` — they are only read by the debug version
    # of this method defined later in the file.
    simulate_seconds = 0
    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            # Build the policy inputs from the current observation.
            state = self.mapping_source_to_mini_by_rule(
                self.get_the_input_right(self.obs))
            # Only render map images periodically while image-debugging.
            if self.image_debug and verbose and self.step % C.time_wait(
                    self.image_wait_secs) == 0:
                show_image = True
            else:
                show_image = False
            if verbose:
                print('show_image:', show_image)
            map_state = U.get_small_simple_map_data(
                self.obs, show_image, show_image)
            if verbose:
                print('map_state.shape:', map_state.shape)
            add_state = self.get_add_state(
                self.get_the_input_right(self.obs))

            if self.ob_space_add == 0:
                # Additional observation space disabled: feed dummies.
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state = np.zeros([1, 1, 1])

            # Get the action and value according to the state.
            action, action_probs, value = self.net.policy.get_act_action_probs(
                state, add_state, map_state, verbose=verbose)

            # If this is not the first state, store the previous transition.
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state,
                          reward, prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)

            # The env steps to new states.
            self.mini_step(action)
            simulate_seconds += self.policy_wait_secs

            prev_state = state
            prev_action = action
            prev_value = value
            prev_add_state = add_state
            prev_map_state = map_state
            self.policy_flag = False

        if self.is_end:
            # Episode over: build the last state and compute its value.
            state = self.mapping_source_to_mini_by_rule(
                self.get_the_input_right(self.obs))
            map_state = U.get_small_simple_map_data(self.obs)
            add_state = self.get_add_state(
                self.get_the_input_right(self.obs))
            if self.ob_space_add == 0:
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state = np.zeros([1, 1, 1])
            value = self.net.policy.get_values(state, add_state, map_state)
            # The value of the last state is defined somewhat differently.
            value = self.get_values_right(value)
            # If this is not the first state, store the final transition.
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state,
                          reward, prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)
            break

    if self.rl_training:
        self.global_buffer.add(self.local_buffer)
        print("add map bn:")
        print("add %d buffer!" % (len(self.local_buffer.rewards)))
def set_flag(self):
    """Raise the strategy/policy flags when their wait intervals elapse.

    A flag is raised when ``self.step`` lands exactly one tick past a
    multiple of the corresponding wait interval (converted to game steps
    via ``C.time_wait``).  The play loops are expected to reset the flags.
    """
    schedule = (
        (self.strategy_wait_secs, 'strategy_flag'),
        (self.policy_wait_secs, 'policy_flag'),
    )
    for wait_secs, flag_name in schedule:
        if self.step % C.time_wait(wait_secs) == 1:
            setattr(self, flag_name, True)
def play_right_add(self, verbose=False):
    """Play one episode with the "right" policy loop (debug-instrumented).

    Same transition-collection loop as the earlier ``play_right_add``, with
    extra verbose-only debugging: plotting action-probability distributions
    (``self.action_prob_debug``) and matching the state against reference
    edge states (``self.edge_state_debug``).

    NOTE(review): this re-definition shadows the earlier ``play_right_add``
    in this file, making the earlier one dead code — confirm this is
    intentional.

    Args:
        verbose: when True, print/plot per-step debugging information.
    """
    prev_state = None
    prev_action = None
    prev_value = None
    prev_add_state = None
    prev_map_state = None
    show_image = False

    # Warm up: issue a no-op, then center the camera on the base position.
    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    # Placeholder observations, used when the additional observation space
    # is disabled (self.ob_space_add == 0).
    self.dummy_add_state = np.zeros(1)
    self.dummy_map_state = np.zeros([1, 1, 1])

    simulate_seconds = 0
    feature_dict = U.edge_state()  # reference edge states for matching
    previous_match = -1            # last matched edge-state index

    # Plot-style defaults.  Fix(review): the edge-state branch below reads
    # ``color``/``max_y``, which were previously bound only inside the
    # action-prob branch — hitting edge_state_debug without action_prob_debug
    # raised NameError.  These defaults match the use_TG styling.
    color = 'b'
    max_y = 0.3

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            # Build the policy inputs from the current observation.
            state = self.mapping_source_to_mini_by_rule(
                self.get_the_input_right(self.obs))
            # Only render map images periodically while image-debugging.
            if self.image_debug and verbose and self.step % C.time_wait(
                    self.image_wait_secs) == 0:
                show_image = True
            else:
                show_image = False
            if verbose:
                print('show_image:', show_image)
            map_state = U.get_small_simple_map_data(
                self.obs, show_image, show_image)
            if verbose:
                print('map_state.shape:', map_state.shape)
            add_state = self.get_add_state(
                self.get_the_input_right(self.obs))

            if self.ob_space_add == 0:
                # Additional observation space disabled: feed dummies.
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state = np.zeros([1, 1, 1])

            # Get the action and value according to the state.
            action, action_probs, value = self.net.policy.get_act_action_probs(
                state, add_state, map_state, verbose=verbose)

            # Whether to save the probability plot this step.
            # NOTE(review): ``save_prob`` is computed but never used, and
            # ``self.prove_save_wait_secs`` looks like a typo of
            # ``probe_save_wait_secs`` — confirm the attribute name.
            if self.probe_debug and verbose and self.step % C.time_wait(
                    self.prove_save_wait_secs) == 0:
                save_prob = True
            else:
                save_prob = False

            if self.action_prob_debug and verbose:
                print('self.step:', self.step)
                print(self.prob_show_wait_seconds)
                print('simulate_seconds:', simulate_seconds)
                use_TG = True  # hard-coded: the 'RG' styling branch is dead
                if use_TG:
                    bar_type = 'TG'
                    max_y = 0.3
                    color = 'b'
                else:
                    bar_type = 'RG'
                    max_y = 0.5
                    color = 'r'
                bar_name = bar_type + '_' + str(simulate_seconds)
                # Fix(review): dropped the redundant ``True and`` prefix.
                if simulate_seconds in self.prob_show_wait_seconds:
                    pprint.pprint(action_probs)
                    U.show_prob_dist(action_probs, show=True, color=color,
                                     max_y=max_y, action_num=self.action_num,
                                     save=False, name=bar_name, count=0)

            if self.edge_state_debug and verbose:
                # Compare the mini state against the reference edge states
                # and save a probability plot on each new match.
                match_list = U.calculate_state_mapping(state, feature_dict)
                print('state:', state)
                print('match_list:', match_list)
                for i, match in enumerate(match_list):
                    if match and i != previous_match:
                        print('Match:', i + 1)
                        match_name = 'ES' + '_' + str(i + 1) + '_' + str(
                            simulate_seconds)
                        U.show_prob_dist(action_probs, show=True, color=color,
                                         max_y=max_y,
                                         action_num=self.action_num,
                                         save=True, name=match_name, count=0)
                        previous_match = i

            # If this is not the first state, store the previous transition.
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state,
                          reward, prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)

            # The env steps to new states.
            self.mini_step(action)
            simulate_seconds += self.policy_wait_secs

            prev_state = state
            prev_action = action
            prev_value = value
            prev_add_state = add_state
            prev_map_state = map_state
            self.policy_flag = False

        if self.is_end:
            # Episode over: build the last state and compute its value.
            state = self.mapping_source_to_mini_by_rule(
                self.get_the_input_right(self.obs))
            map_state = U.get_small_simple_map_data(self.obs)
            add_state = self.get_add_state(
                self.get_the_input_right(self.obs))
            if self.ob_space_add == 0:
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state = np.zeros([1, 1, 1])
            value = self.net.policy.get_values(state, add_state, map_state)
            # The value of the last state is defined somewhat differently.
            value = self.get_values_right(value)
            # If this is not the first state, store the final transition.
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state,
                          reward, prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)
            break

    if self.rl_training:
        self.global_buffer.add(self.local_buffer)
        print("add map bn:")
        print("add %d buffer!" % (len(self.local_buffer.rewards)))