# Imports assumed by the examples below; names such as `policy`, `actor`,
# `model`, `vaccine_supply`, and `myLog` come from each snippet's surrounding file.
import torch
import torch.nn.functional as F
from torch.distributions import Categorical, Multinomial
from typing import Any, List

def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)  # Variable() is deprecated; pass the tensor directly
    m = Categorical(probs)  # the old Multinomial(probs) sampler is now Categorical
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))  # saved for the PG update
    return action.item()  # .data[0] is deprecated; .item() returns a Python int

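# NOTE (added): a minimal sketch of how `policy.saved_log_probs` is typically
# consumed by a REINFORCE-style update; `optimizer`, the per-step `rewards`
# list, and `gamma` are hypothetical scaffolding, not part of the snippet above.
def finish_episode(rewards, gamma=0.99):
    returns, R = [], 0.0
    for r in reversed(rewards):  # discounted return at each step
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize
    loss = torch.stack(
        [-log_p * R for log_p, R in zip(policy.saved_log_probs, returns)]
    ).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    del policy.saved_log_probs[:]  # clear the buffer for the next episode
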
def decide(self, choices: List[Any]) -> int:
    inputs = [torch.FloatTensor(choice) for choice in choices]
    enhanced_features = [self._base_network.model(vec) for vec in inputs]
    # .detach() stops policy gradients from flowing back into the base network.
    action_features = [self._policy_gradient.model(vec.detach())
                       for vec in enhanced_features]
    # Get move
    probabilities = F.softmax(torch.cat(action_features), dim=0)
    distribution = Multinomial(1, probabilities)
    move = distribution.sample()    # one-hot vector over the choices
    _, index_of_move = move.max(0)  # index of the sampled choice
    # Expected reward
    expected_reward = self._value_function.model(enhanced_features[index_of_move])
    log_probability = distribution.log_prob(move)
    # Record estimate
    self.rounds.append(Round(value=expected_reward, log_probability=log_probability))
    return index_of_move.item()

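# NOTE (added): with total_count=1, Multinomial.sample() returns a one-hot
# count vector rather than an index, which is why the snippet above recovers
# the index with .max(0). A small standalone check (values illustrative):
probs = torch.tensor([0.1, 0.7, 0.2])
m = Multinomial(1, probs)
sample = m.sample()          # e.g. tensor([0., 1., 0.])
_, idx = sample.max(0)       # index of the chosen category
log_p = m.log_prob(sample)   # log-probability of that one-hot outcome
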
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    # The actor-critic model returns action probabilities and a state value.
    probs, state_value = model(state)  # Variable() is deprecated
    m = Categorical(probs)  # the old Multinomial(probs) sampler is now Categorical
    action = m.sample()
    # Save both the log-probability and the critic's value estimate.
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()

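# NOTE (added): a minimal sketch of how `model.saved_actions` might feed an
# actor-critic update, assuming `returns` is a list of discounted returns
# (floats) computed as in the REINFORCE sketch above; `optimizer` is again
# hypothetical scaffolding, not from the snippet itself.
def update(returns):
    policy_losses, value_losses = [], []
    for (log_prob, value), R in zip(model.saved_actions, returns):
        advantage = R - value.item()                 # critic as baseline
        policy_losses.append(-log_prob * advantage)  # actor term
        value_losses.append(F.smooth_l1_loss(value.squeeze(), torch.tensor(R)))
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    del model.saved_actions[:]
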
def select_action(state, variance=1, temp=10):
    # Select a stochastic action from the temperature-scaled policy
    # probabilities. Note: `variance` is accepted but unused here.
    state = torch.from_numpy(state).float().unsqueeze(0)
    action_scores = actor(state)
    prob = F.softmax(action_scores / temp, dim=1)
    # Sample an allocation of vaccine_supply units across the categories.
    m = Multinomial(vaccine_supply, prob[0])
    action = m.sample()
    log_prob = m.log_prob(action)
    # Entropy of the underlying categorical distribution.
    entropy = -(prob * torch.log(prob)).sum(1, keepdim=True)
    return action.numpy(), log_prob, entropy

def select_action(state, variance=1, temp=10):
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    action_scores = actor(state)
    print(action_scores, file=myLog)  # log raw scores for debugging
    prob = F.softmax(action_scores / temp, dim=1)
    # Sample an allocation of vaccine_supply units across the categories.
    m = Multinomial(vaccine_supply, prob[0])
    action = m.sample()
    log_prob = m.log_prob(action)
    entropy = -torch.sum(torch.log(prob) * prob, dim=-1)
    return action.numpy(), log_prob, entropy

def select_action(state, variance=1, temp=10):
    # Select a stochastic action from the temperature-scaled policy probabilities.
    # torch.tensor(...) is used instead of torch.from_numpy(...) for speed.
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    action_scores = actor(state)
    print(action_scores, file=myLog)  # log raw scores for debugging
    prob = F.softmax(action_scores / temp, dim=1)
    m = Multinomial(vaccine_supply, prob[0])
    action = m.sample()
    log_prob = m.log_prob(action)
    entropy = -torch.sum(torch.log(prob) * prob, dim=-1)
    return action.numpy(), log_prob, entropy

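# NOTE (added): unlike Categorical, Multinomial(total_count, probs) samples a
# vector of non-negative counts summing to total_count, i.e. an allocation of
# vaccine_supply units across locations. A small standalone check (the supply
# value of 100 is illustrative):
probs = torch.tensor([0.5, 0.3, 0.2])
m = Multinomial(100, probs)
alloc = m.sample()         # e.g. tensor([52., 29., 19.]); always sums to 100
assert alloc.sum() == 100
log_p = m.log_prob(alloc)  # log-probability of this exact allocation
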
def select_action(self, state, temp=1):
    # Select a stochastic action from the policy probabilities.
    state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
    logits = self.actor(state)
    # TODO: check this normalization later
    logits_norm = (logits - torch.mean(logits)) / (torch.std(logits) + 1e-5)
    m = Multinomial(self.args.vaccine_supply, logits=logits_norm.squeeze() / temp)
    action = m.sample()
    log_prob = m.log_prob(action)
    # m.logits are normalized log-probabilities, so this is the categorical entropy.
    entropy = -torch.sum(m.logits * m.probs)
    return action.to('cpu').numpy(), log_prob, entropy

def select_action(state, variance=1, temp=1):
    # Select a stochastic action from the policy probabilities.
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    action_scores = actor(state)
    # Standardize the scores before using them as logits.
    action_scores_norm = (action_scores - torch.mean(action_scores)) / \
                         (torch.std(action_scores) + 1e-5)
    m = Multinomial(vaccine_supply, logits=action_scores_norm.squeeze() / temp)
    action = m.sample()
    log_prob = m.log_prob(action)
    # m.logits are normalized log-probabilities, so this is the categorical entropy.
    entropy = -torch.sum(m.logits * m.probs, dim=-1)
    return action.to('cpu').numpy(), log_prob, entropy

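# NOTE (added): torch.distributions normalizes the `logits` property to
# log-probabilities, so -(probs * logits).sum() above matches the entropy of
# the underlying category distribution. A quick standalone check:
logits = torch.randn(5)
m = Multinomial(10, logits=logits)
manual = -torch.sum(m.logits * m.probs)
reference = torch.distributions.Categorical(logits=logits).entropy()
assert torch.allclose(manual, reference)
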
def evaluate(self, possible_boards):
    # possible_boards -> neural network -> last-layer outputs, one per board
    last_layer_outputs = self.run_through_neural_network(possible_boards)
    # Decide the move and save its log-probability for the backward pass.
    probs = self.pg_plugin._softmax(last_layer_outputs)
    distribution = Multinomial(1, probs)
    move = distribution.sample()  # one-hot vector over the possible boards
    self.saved_log_probabilities.append(distribution.log_prob(move))
    _, move = move.max(0)  # recover the index of the sampled board
    # Calculate the value estimate and save it for the backward pass.
    value_estimate = self.pg_plugin.value_model(last_layer_outputs[move])
    self.saved_value_estimations.append(value_estimate)
    return move