# Shared imports for the excerpts below; some excerpts alias torch as T.
# augment_ma is assumed to be imported from the project's own helpers
# (its module path is not shown in this section).
import numpy as np
import torch
import torch as T


def select_action(self, observation):
    stpt_actions = []
    therm_actions = []
    blind_actions = []
    idxs = []
    # Sample one random setpoint, thermostat, and blind action per actuator,
    # recording each action's index within its discrete action space.
    for i in range(self.num_sat_actions):
        rand = np.random.choice(self.stpt_action_space)
        stpt_actions.append(rand)
        idxs.append(np.where(self.stpt_action_space == rand)[0].item())
    for i in range(self.num_therm_actions):
        rand = np.random.choice(self.therm_action_space)
        therm_actions.append(rand)
        idxs.append(np.where(self.therm_action_space == rand)[0].item())
    for i in range(self.num_blind_actions):
        rand = np.random.choice(self.blind_action_space)
        blind_actions.append(rand)
        idxs.append(np.where(self.blind_action_space == rand)[0].item())
    # Convert each raw setpoint action into a (reward action, SAT setpoint) pair.
    sat_actions_tups = []
    for a in stpt_actions:
        action_stpt, sat_sp = augment_ma(observation, a)
        sat_actions_tups.append((action_stpt, sat_sp))
    # Record the sampled indices on the observation for later use in training.
    for i, idx in enumerate(idxs):
        observation[1][f"Action idx {i}"] = idx
    return sat_actions_tups, therm_actions, blind_actions, idxs
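# A self-contained illustration (values are assumptions, not the repo's) of
# the np.where index-recovery trick used above: sample a value from a discrete
# action space, then recover its position in that space.
_demo_space = np.array([12.0, 14.0, 16.0, 18.0])
_demo_rand = np.random.choice(_demo_space)
_demo_idx = np.where(_demo_space == _demo_rand)[0].item()
assert _demo_space[_demo_idx] == _demo_rand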
def agent_start(self, state):
    action, blind_action = self.policy_old.act(state[0], self.memory)
    self.last_action = action
    self.last_state = state
    # In the discrete case, `act` returns indices, so map them to the
    # corresponding values in the action tables.
    if self.discrete:
        action = self.action_space[action]
        blind_action = self.blind_action_space[blind_action]
    action_stpt, sat_sp = augment_ma(state, action)
    return action_stpt, sat_sp, blind_action
def choose_action(self, observation):
    # Epsilon-greedy: exploit the Q-network with probability 1 - epsilon,
    # otherwise sample a random action from the action space.
    if np.random.random() > self.epsilon:
        state, _, _ = observation
        actions = self.q_eval.forward(state)
        action_idx = T.argmax(actions).item()
        action = self.action_space[action_idx]
    else:
        action = np.random.choice(self.action_space)
    action, sat_sp = augment_ma(observation, action)
    return action, sat_sp
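# The epsilon-greedy branch above assumes self.epsilon is annealed elsewhere.
# A minimal sketch of a common linear-decay schedule; `eps_dec` and `eps_min`
# are hypothetical hyperparameters, not names confirmed by this code.
def decrement_epsilon(self):
    # Step exploration down linearly after each learning step until it
    # reaches the floor value.
    self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)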
def select_action(self, observation, evaluate=False):
    """
    :param observation: tuple whose first element is the state tensor
    :param evaluate: if True, use the policy's deterministic action
        instead of sampling
    :return: `action` is for the reward function; `sat_sp` is what's used
        by env_step and by the model for training
    """
    actions = []
    if self.start_steps > self.total_numsteps:
        # Warm-up phase: sample each action uniformly from its bounds.
        for i in range(self.action_space.shape[0]):
            a = np.random.uniform(self.action_space[i].min(),
                                  self.action_space[i].max())
            actions.append(a)
    else:
        state, _, _ = observation
        state = torch.FloatTensor(state.float()).to(self.device).unsqueeze(0)
        if not evaluate:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        for i in range(self.action_space.shape[0]):
            a = action.detach().cpu().numpy()[0][i]
            actions.append(a)
    # Split the flat action vector into setpoint, thermostat, and blind segments.
    sat_actions = actions[0:self.num_sat_actions]
    therm_actions = actions[self.num_sat_actions:self.num_therm_actions + self.num_sat_actions]
    blind_actions = actions[self.num_therm_actions + self.num_sat_actions:]
    sat_actions_tups = []
    for a in sat_actions:
        action_stpt, sat_sp = augment_ma(observation, a)
        sat_actions_tups.append((action_stpt, sat_sp))
    if len(sat_actions) == 0:
        # This is hacky but keeps the parsing in the main file clean:
        # pad with an empty pair so sat_actions_tups[0] always exists.
        sat_actions_tups.append(([], []))
    return sat_actions_tups, therm_actions, blind_actions, actions
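# A self-contained sketch of how the slicing above partitions the flat action
# vector; the counts and values here are assumptions chosen for illustration.
_num_sat, _num_therm = 1, 2
_actions = [14.5, 21.0, 22.5, 0.8]                 # [sat, therm, therm, blind]
_sat = _actions[0:_num_sat]                        # [14.5]
_therm = _actions[_num_sat:_num_therm + _num_sat]  # [21.0, 22.5]
_blind = _actions[_num_therm + _num_sat:]          # [0.8]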
def select_action(self, observation):
    # Epsilon-greedy over two independent heads: one for setpoints, one for blinds.
    if np.random.random() > self.epsilon:
        state, _, _ = observation
        actions_stpt, actions_blinds = self.q_eval.forward(state)
        # Setpoint action
        actions_stpt_idx = T.argmax(actions_stpt).item()
        action_stpt = self.action_space[0][actions_stpt_idx]
        # Blind action
        actions_blinds_idx = T.argmax(actions_blinds).item()
        action_blinds = self.action_space[1][actions_blinds_idx]
    else:
        action_stpt = np.random.choice(self.action_space[0])
        action_blinds = np.random.choice(self.action_space[1])
    action_stpt, sat_sp = augment_ma(observation, action_stpt)
    return action_stpt, sat_sp, action_blinds
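# select_action above expects q_eval.forward to return one Q-vector per head.
# A minimal sketch of such a dual-head module; the class name, hidden width,
# and layer layout are assumptions, not the repo's actual network.
import torch.nn as nn

class TwoHeadQNetwork(nn.Module):
    def __init__(self, input_dims, n_stpt_actions, n_blind_actions):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(input_dims, 64), nn.ReLU())
        self.stpt_head = nn.Linear(64, n_stpt_actions)    # Q-values for setpoints
        self.blind_head = nn.Linear(64, n_blind_actions)  # Q-values for blinds

    def forward(self, state):
        h = self.trunk(state)
        return self.stpt_head(h), self.blind_head(h)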
def agent_start(self, state):
    action_idx = self.policy_old.act(state[0], self.memory)
    self.last_action = action_idx[:]
    self.last_state = state
    # Map each sampled index to its value in the per-actuator action tables.
    actions = []
    for i, action in enumerate(action_idx):
        actions.append(self.action_space[i][action])
    sat_actions = actions[:self.num_sat_actions]
    therm_actions = actions[self.num_sat_actions:self.num_therm_actions + self.num_sat_actions]
    blind_actions = actions[self.num_therm_actions + self.num_sat_actions:]
    sat_actions_tups = []
    for action in sat_actions:
        action_stpt, sat_sp = augment_ma(state, action)
        sat_actions_tups.append((action_stpt, sat_sp))
    if len(sat_actions) == 0:
        # Same padding trick as in select_action above: keep sat_actions_tups
        # non-empty so the main file can index it uniformly.
        sat_actions_tups.append(([], []))
    return sat_actions_tups, therm_actions, blind_actions
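# Why the ([], []) padding: a sketch (the main-loop unpacking is an assumption)
# of the uniform indexing it enables even with zero setpoint actions.
_tups = [([], [])]               # what agent_start returns with no SAT actions
_action_stpt, _sat_sp = _tups[0] # indexing [0] never raises, no special-casing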