def play_train_mini(self, verbose=False):
    is_attack = False
    state_last = None
    action_last = None

    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            state_now = self.mapping_source_to_mini_by_rule(self.get_the_input())
            if self.greedy_action:
                action_prob, v_preds = self.net.policy.get_action_probs(state_now, verbose=False)
                action = np.argmax(action_prob)
            else:
                action, v_preds = self.net.policy.get_action(state_now, verbose=False)

            self.mini_step(action)
            if state_last is not None:
                if verbose:
                    print('state_last:', state_last, ', action_last:', action_last,
                          ', state_now:', state_now)
                v_preds_next = self.net.policy.get_values(state_now)
                v_preds_next = self.get_values(v_preds_next)
                reward = 0
                self.local_buffer.append(state_last, action_last, state_now,
                                         reward, v_preds, v_preds_next)

            # once an attack has started, keep attacking continuously,
            # consistent with the mind-game
            if action == ProtossAction.Attack.value:
                is_attack = True
            if is_attack:
                self.mini_step(ProtossAction.Attack.value)

            state_last = state_now
            action_last = action
            self.policy_flag = False

        if self.is_end:
            if self.rl_training:
                # the final reward carries the game result
                self.local_buffer.rewards[-1] += self.result['reward']
                print(self.local_buffer.rewards)
                self.global_buffer.add(self.local_buffer)
                print("add %d buffer!" % len(self.local_buffer.rewards))
            break
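# A minimal sketch of the transition buffer interface that play_train_mini
# (and the other play_* methods below) relies on. This is NOT the repo's
# implementation of local_buffer; it is a hypothetical stand-in that only
# illustrates the append(...) signature and the rewards list accessed above.
class LocalBufferSketch(object):
    """Hypothetical stand-in for self.local_buffer (assumption)."""

    def __init__(self):
        self.observations = []
        self.actions = []
        self.observations_next = []
        self.rewards = []
        self.values = []
        self.values_next = []

    def append(self, obs, action, obs_next, reward, value, value_next):
        # one transition per policy step; the last reward is later
        # augmented with the game result in play_train_mini
        self.observations.append(obs)
        self.actions.append(action)
        self.observations_next.append(obs_next)
        self.rewards.append(reward)
        self.values.append(value)
        self.values_next.append(value_next)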
def play_right(self, verbose=False):
    # note: this is the "right" (corrected) version of game play
    prev_state = None
    prev_action = None
    prev_value = None

    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            # get the state
            state = self.mapping_source_to_mini_by_rule(self.get_the_input_right(self.obs))

            # get the action and value according to the state
            action, value = self.net.policy.get_action(state, verbose=verbose)

            # if this is not the first state, store the transition in the buffer
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_action, state, reward, prev_value, value)
                self.local_buffer.append(prev_state, prev_action, state, reward,
                                         prev_value, value)

            self.mini_step(action)  # the env steps to a new state
            prev_state = state
            prev_action = action
            prev_value = value
            self.policy_flag = False

        if self.is_end:
            # get the last state, value, and reward
            state = self.mapping_source_to_mini_by_rule(self.get_the_input_right(self.obs))
            value = self.net.policy.get_values(state)
            # the value of the last state is defined somewhat differently
            value = self.get_values_right(value)

            # if this is not the first state, store the final transition
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_action, state, reward, prev_value, value)
                self.local_buffer.append(prev_state, prev_action, state, reward,
                                         prev_value, value)
            break

    if self.rl_training:
        if verbose:
            print(self.local_buffer.values)
            print(self.local_buffer.values_next)
        self.global_buffer.add(self.local_buffer)
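# "The value of the last state is defined somewhat differently": a common
# convention is to bootstrap with 0 at a true terminal state so the return
# does not leak past the end of the episode. A minimal sketch of what
# get_values_right could look like under that assumption (hypothetical,
# not the repo's actual definition):
def get_values_right_sketch(self, value):
    # if the episode really ended with a win or loss, the terminal
    # state has no future return, so its bootstrap value is 0
    if self.is_end and self.result['reward'] != 0:
        return 0
    return value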
def sample(self, verbose=False, use_image=False):
    state_last = None
    action_last = None

    random_generated_int = random.randint(0, 2**31 - 1)
    filename = self.extract_save_dir + "/" + str(random_generated_int) + ".npz"
    recording_obs = []
    recording_img = []
    recording_action = []

    np.random.seed(random_generated_int)
    tf.set_random_seed(random_generated_int)

    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            state_now = self.mapping_source_to_mini_by_rule(self.get_the_input())
            recording_obs.append(state_now)
            if use_image:
                recording_img.append(U.get_simple_map_data(self.obs))

            action, v_preds = self.net.policy.get_action(state_now, verbose=False)
            recording_action.append(action)

            self.mini_step(action)
            if state_last is not None:
                if verbose:
                    print('state_last:', state_last, ', action_last:', action_last,
                          ', state_now:', state_now)
                v_preds_next = self.net.policy.get_values(state_now)
                v_preds_next = self.get_values(v_preds_next)
                reward = 0
                self.local_buffer.append(state_last, action_last, state_now,
                                         reward, v_preds, v_preds_next)

            state_last = state_now
            action_last = action
            self.policy_flag = False

        if self.is_end:
            # note: uint16 overflows for values larger than 65535,
            # so very large mineral counts would be truncated
            recording_obs = np.array(recording_obs, dtype=np.uint16)
            recording_action = np.array(recording_action, dtype=np.uint8)
            if not use_image:
                np.savez_compressed(filename, obs=recording_obs,
                                    action=recording_action)
            else:
                recording_img = np.array(recording_img, dtype=np.float16)
                np.savez_compressed(filename, obs=recording_obs, img=recording_img,
                                    action=recording_action)
            break
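# Usage sketch: the .npz files written by sample() can be loaded back with
# plain NumPy for offline analysis or imitation learning. The keys ('obs',
# 'img', 'action') are exactly those passed to np.savez_compressed above;
# the helper name and shape comments are illustrative assumptions.
def load_sample_sketch(filename, use_image=False):
    data = np.load(filename)
    obs = data['obs']        # (T, state_dim) uint16 mini-states
    action = data['action']  # (T,) uint8 action ids
    img = data['img'] if use_image else None  # (T, H, W, C) float16 map data
    return obs, img, action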
def play_right_add(self, verbose=False):
    # note: this is the "right" version of game play, which also feeds the
    # additional (add) state and the map state to the policy
    prev_state = None
    prev_action = None
    prev_value = None
    prev_add_state = None
    prev_map_state = None
    show_image = False

    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    self.dummy_add_state = np.zeros(1)
    self.dummy_map_state = np.zeros([1, 1, 1])

    simulate_seconds = 0
    feature_dict = U.edge_state()
    previous_match = -1

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            # get the state
            state = self.mapping_source_to_mini_by_rule(self.get_the_input_right(self.obs))

            show_image = (self.image_debug and verbose
                          and self.step % C.time_wait(self.image_wait_secs) == 0)
            if verbose:
                print('show_image:', show_image)

            map_state = U.get_small_simple_map_data(self.obs, show_image, show_image)
            if verbose:
                print('map_state.shape:', map_state.shape)
            add_state = self.get_add_state(self.get_the_input_right(self.obs))

            # replace the extra inputs with fixed-shape dummies when disabled
            if self.ob_space_add == 0:
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state

            # get the action and value according to the state
            action, action_probs, value = self.net.policy.get_act_action_probs(
                state, add_state, map_state, verbose=verbose)

            # if this is not the first state, store the transition in the buffer
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state, reward,
                          prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)

            self.mini_step(action)  # the env steps to a new state
            simulate_seconds += self.policy_wait_secs

            prev_state = state
            prev_action = action
            prev_value = value
            prev_add_state = add_state
            prev_map_state = map_state
            self.policy_flag = False

        if self.is_end:
            # get the last state, value, and reward
            state = self.mapping_source_to_mini_by_rule(self.get_the_input_right(self.obs))
            map_state = U.get_small_simple_map_data(self.obs)
            add_state = self.get_add_state(self.get_the_input_right(self.obs))
            if self.ob_space_add == 0:
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state

            value = self.net.policy.get_values(state, add_state, map_state)
            # the value of the last state is defined somewhat differently
            value = self.get_values_right(value)

            # if this is not the first state, store the final transition
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state, reward,
                          prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)
            break

    if self.rl_training:
        self.global_buffer.add(self.local_buffer)
        print("add map bn:")
        print("add %d buffer!" % len(self.local_buffer.rewards))
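# Design note (sketch): feeding fixed-shape dummy tensors when
# self.ob_space_add == 0 keeps the policy-network call signature unchanged,
# so the same get_act_action_probs / get_values calls work whether or not
# the additional and map inputs are enabled. A hypothetical helper that
# captures this pattern (not part of the original code):
def select_extra_inputs_sketch(self, add_state, map_state):
    # return either the real extra inputs or the fixed dummies
    if self.ob_space_add == 0:
        return self.dummy_add_state, self.dummy_map_state
    return add_state, map_state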
def play_train(self, continues_attack=False, verbose=False):
    state_last = None
    action_last = None

    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            # non-image features come from the rule-based mapping; image
            # features come from the map and are encoded into a latent
            # vector, then both are concatenated into one state vector
            non_image_feature = self.mapping_source_to_mini_by_rule(self.get_the_input())
            image_feature = U.get_simple_map_data(self.obs)
            latent_image_feature, mu, logvar = self.encode_obs(image_feature)
            feature = np.concatenate([non_image_feature, latent_image_feature], axis=-1)

            reward_last = 0
            state_now, action, v_preds = self.get_action(feature, reward_last)

            self.mini_step(action)
            if state_last is not None:
                if verbose:
                    print('state_last:', state_last, ', action_last:', action_last,
                          ', state_now:', state_now)
                v_preds_next = self.net.policy.get_values(state_now)
                v_preds_next = self.get_values(v_preds_next)
                reward = 0
                self.local_buffer.append(state_last, action_last, state_now,
                                         reward, v_preds, v_preds_next)

            state_last = state_now
            action_last = action
            self.policy_flag = False

        if self.is_end:
            if self.rl_training:
                # the final reward carries the game result
                self.local_buffer.rewards[-1] += self.result['reward']
                self.global_buffer.add(self.local_buffer)
            break
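# encode_obs() returns a latent vector plus the VAE's mu and logvar. A
# minimal NumPy sketch of the standard reparameterization this implies
# (hypothetical; the repo's encoder is a trained network, not this helper):
def reparameterize_sketch(mu, logvar):
    # z = mu + sigma * eps, with eps ~ N(0, I) and sigma = exp(logvar / 2)
    sigma = np.exp(0.5 * logvar)
    eps = np.random.randn(*np.asarray(mu).shape)
    return mu + sigma * eps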
def sample(self, verbose=False, use_image=True):
    state_last = None
    action_last = None

    random_generated_int = random.randint(0, 2**31 - 1)
    filename = self.extract_save_dir + "/" + str(random_generated_int) + ".npz"
    recording_obs = []
    recording_img = []
    recording_action = []
    recording_reward = []

    np.random.seed(random_generated_int)
    tf.set_random_seed(random_generated_int)

    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            non_image_feature = self.mapping_source_to_mini_by_rule(self.get_the_input())
            image_feature = U.get_simple_map_data(self.obs)
            latent_image_feature, mu, logvar = self.encode_obs(image_feature)
            feature = np.concatenate([non_image_feature, latent_image_feature], axis=-1)

            reward_last = 0
            state_now, action, v_preds = self.get_action(feature, reward_last)

            self.mini_step(action)
            if state_last is not None:
                if verbose:
                    print('state_last:', state_last, ', action_last:', action_last,
                          ', state_now:', state_now)
                reward = 0
                # record the trajectory instead of appending to the RL buffer
                recording_obs.append(non_image_feature)
                recording_img.append(image_feature)
                recording_action.append(action)
                recording_reward.append(reward)

            state_last = state_now
            action_last = action
            self.policy_flag = False

        if self.is_end:
            # encode the game result into the final reward:
            # 0 (not ended), 1 (loss), 2 (draw), 3 (win)
            recording_reward[-1] = self.result['reward'] + 2
            if recording_reward[-1] != 0:
                print("result is:", recording_reward[-1])
            recording_obs = np.array(recording_obs, dtype=np.uint16)
            recording_action = np.array(recording_action, dtype=np.uint8)
            recording_reward = np.array(recording_reward, dtype=np.uint8)
            recording_img = np.array(recording_img, dtype=np.float16)
            np.savez_compressed(filename, obs=recording_obs, img=recording_img,
                                action=recording_action, reward=recording_reward)
            break
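# The saved reward channel encodes the result as 0 (not ended), 1 (loss),
# 2 (draw), 3 (win). A small sketch (hypothetical helper name) for mapping
# it back to the usual -1/0/+1 game outcome when post-processing recordings:
def decode_result_sketch(recording_reward):
    # only the final entry carries the outcome; undo the +2 offset
    return int(recording_reward[-1]) - 2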
def play_right_add(self, verbose=False):
    # note: this is the "right" version of game play, which also feeds the
    # additional (add) state and the map state to the policy, plus several
    # debug hooks for inspecting the action probabilities
    prev_state = None
    prev_action = None
    prev_value = None
    prev_add_state = None
    prev_map_state = None
    show_image = False

    self.safe_action(C._NO_OP, 0, [])
    self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
    self._gases = U.find_initial_gases(self.obs)

    self.dummy_add_state = np.zeros(1)
    self.dummy_map_state = np.zeros([1, 1, 1])

    simulate_seconds = 0
    feature_dict = U.edge_state()
    previous_match = -1

    # bar-chart style for the probability plots (TG: blue, RG: red);
    # defined up front so the edge-state debug block below can use it
    use_TG = True
    if use_TG:
        bar_type, max_y, color = 'TG', 0.3, 'b'
    else:
        bar_type, max_y, color = 'RG', 0.5, 'r'

    while True:
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        if self.policy_flag and (not self.is_end):
            # get the state
            state = self.mapping_source_to_mini_by_rule(self.get_the_input_right(self.obs))

            show_image = (self.image_debug and verbose
                          and self.step % C.time_wait(self.image_wait_secs) == 0)
            if verbose:
                print('show_image:', show_image)

            map_state = U.get_small_simple_map_data(self.obs, show_image, show_image)
            if verbose:
                print('map_state.shape:', map_state.shape)
            add_state = self.get_add_state(self.get_the_input_right(self.obs))

            # replace the extra inputs with fixed-shape dummies when disabled
            if self.ob_space_add == 0:
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state

            # get the action and value according to the state
            action, action_probs, value = self.net.policy.get_act_action_probs(
                state, add_state, map_state, verbose=verbose)

            # note: save_prob is currently unused in this method
            save_prob = (self.probe_debug and verbose
                         and self.step % C.time_wait(self.prove_save_wait_secs) == 0)

            if self.action_prob_debug and verbose:
                print('self.step:', self.step)
                print(self.prob_show_wait_seconds)
                print('simulate_seconds:', simulate_seconds)
                bar_name = bar_type + '_' + str(simulate_seconds)
                if simulate_seconds in self.prob_show_wait_seconds:
                    pprint.pprint(action_probs)
                    U.show_prob_dist(action_probs, show=True, color=color,
                                     max_y=max_y, action_num=self.action_num,
                                     save=False, name=bar_name, count=0)

            if self.edge_state_debug and verbose:
                match_list = U.calculate_state_mapping(state, feature_dict)
                print('state:', state)
                print('match_list:', match_list)
                for i, match in enumerate(match_list):
                    if match and i != previous_match:
                        print('Match:', i + 1)
                        match_name = 'ES' + '_' + str(i + 1) + '_' + str(simulate_seconds)
                        U.show_prob_dist(action_probs, show=True, color=color,
                                         max_y=max_y, action_num=self.action_num,
                                         save=True, name=match_name, count=0)
                        previous_match = i

            # if this is not the first state, store the transition in the buffer
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state, reward,
                          prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)

            self.mini_step(action)  # the env steps to a new state
            simulate_seconds += self.policy_wait_secs

            prev_state = state
            prev_action = action
            prev_value = value
            prev_add_state = add_state
            prev_map_state = map_state
            self.policy_flag = False

        if self.is_end:
            # get the last state, value, and reward
            state = self.mapping_source_to_mini_by_rule(self.get_the_input_right(self.obs))
            map_state = U.get_small_simple_map_data(self.obs)
            add_state = self.get_add_state(self.get_the_input_right(self.obs))
            if self.ob_space_add == 0:
                add_state = self.dummy_add_state
                map_state = self.dummy_map_state

            value = self.net.policy.get_values(state, add_state, map_state)
            # the value of the last state is defined somewhat differently
            value = self.get_values_right(value)

            # if this is not the first state, store the final transition
            if prev_state is not None:
                reward = self.obs.reward
                if verbose:
                    print(prev_state, prev_add_state, prev_action, state, reward,
                          prev_value, value)
                self.local_buffer.append_more_more(
                    prev_state, prev_add_state, prev_map_state, prev_action,
                    state, reward, prev_value, value)
            break

    if self.rl_training:
        self.global_buffer.add(self.local_buffer)
        print("add map bn:")
        print("add %d buffer!" % len(self.local_buffer.rewards))
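# U.show_prob_dist renders the policy's action distribution as a bar chart.
# A self-contained matplotlib sketch with the same parameter names as the
# calls above (a hypothetical reimplementation, shown only to document the
# debug hooks; the repo's actual plotting code may differ):
def show_prob_dist_sketch(action_probs, show=True, color='b', max_y=0.3,
                          action_num=10, save=False, name='TG_0', count=0):
    import matplotlib.pyplot as plt
    # flatten and truncate to the action space size
    probs = np.asarray(action_probs).reshape(-1)[:action_num]
    plt.figure()
    plt.bar(range(len(probs)), probs, color=color)
    plt.ylim(0, max_y)
    plt.xlabel('action id')
    plt.ylabel('probability')
    plt.title(name)
    if save:
        plt.savefig('%s_%d.png' % (name, count))
    if show:
        plt.show()
    plt.close()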