def learn(self, states, actions, action_probs, returns, next_states):
        values = self.values(states)

        advantages = returns - values
        advantages_normalized = (advantages - advantages.mean()) / (
            advantages.std() + 1.0e-10)
        # print(f"returns[0]: {returns[0]}")
        # print(f"values[0]: {values[0]}")
        # print(f"advantages[0]: {advantages[0]}")
        # print(f"advantages_normalized[0]: {advantages_normalized[0]}")

        policy_objectives = []
        value_losses = []
        entropy_values = []
        for _ in range(self.optimization_epochs):
            batcher = Batcher(num_data_points=len(states),
                              batch_size=self.batch_size)
            batcher.shuffle()
            for batch in batcher.batches():
                sampled_advantages = advantages_normalized[batch]
                sampled_states = states[batch]
                sampled_action_probs = action_probs[batch]
                sampled_actions = actions[batch]
                sampled_returns = returns[batch]

                policy_objective, value_loss, entropy_value = self.learn_from_samples(
                    sampled_advantages=sampled_advantages,
                    sampled_states=sampled_states,
                    sampled_action_probs=sampled_action_probs,
                    sampled_actions=sampled_actions,
                    sampled_returns=sampled_returns)

                policy_objectives.append(policy_objective)
                value_losses.append(value_loss)
                entropy_values.append(entropy_value)

        # the clipping parameter could also be annealed over time
        # self.epsilon *= 0.999

        # the entropy regularization weight decays each update,
        # which reduces exploration in later runs
        self.entropy_weight *= self.entropy_reduction_rate

        return np.mean(policy_objectives), np.mean(value_losses), np.mean(
            entropy_values)
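
The learn method above only relies on a small Batcher interface: construction with num_data_points and batch_size, a shuffle() call, and iteration over batches() of indices. The class below is a minimal sketch of that interface, inferred from this usage alone; the real implementation in the source project (and the differently constructed Batcher in Example #4) may differ.

import numpy as np

class Batcher:
    """Yields shuffled index batches over num_data_points items."""

    def __init__(self, num_data_points, batch_size):
        self.batch_size = batch_size
        self.indices = np.arange(num_data_points)

    def shuffle(self):
        np.random.shuffle(self.indices)

    def batches(self):
        # Consecutive slices of the (possibly shuffled) index array;
        # the final slice may be smaller than batch_size.
        for start in range(0, len(self.indices), self.batch_size):
            yield self.indices[start:start + self.batch_size]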
Example #2
from att_lstm_model import AttLSTM_Model

# Build the attention-LSTM model from the vocabulary, embedding and hyperparameter settings.
model = AttLSTM_Model(num_classes, max_length, len(dicts['token2id']), embd,
                      emb_dim=input_dim, hidden_dim=hidden_dim, lr=0.001,
                      num_ways=num_ways)

num_back = 0
step_per_epoch = train_batcher.max_batch_num
best_acc = 0
best_rep = ""
best_dev_rep = ""
train_batcher.shuffle()

for epoch in range(num_epoch):
    loss = 0.0
    print("Epoch %d" % epoch)
    for i in range(step_per_epoch):
        random_ = np.random.random_sample()
        if not use_concepts:
            input_x, y, targets, lengths = train_batcher.next()
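
The truncated training loop above only needs train_batcher to expose max_batch_num, shuffle(), and a next() method returning (input_x, y, targets, lengths). A hypothetical stand-in with that interface could look like the following; the batcher actually used in the source project is not shown here.

class EpochBatcher:
    """Hypothetical stand-in for the batcher interface used in Example #2."""

    def __init__(self, examples, batch_size):
        # examples: list of (input_x, y, targets, lengths) tuples
        self.examples = examples
        self.batch_size = batch_size
        self.max_batch_num = (len(examples) + batch_size - 1) // batch_size
        self._cursor = 0

    def shuffle(self):
        np.random.shuffle(self.examples)
        self._cursor = 0

    def next(self):
        if self._cursor >= len(self.examples):
            self._cursor = 0
        batch = self.examples[self._cursor:self._cursor + self.batch_size]
        self._cursor += self.batch_size
        # Transpose the list of tuples into four parallel lists.
        return tuple(map(list, zip(*batch)))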
Example #3
[x_context_data, x_target_mean_data, y_data, feature_data] = test_batcher.next()
test_feed = {y: y_data, keep_prob_context: [1], keep_prob_target: [1], feature: feature_data}
for i in range(args.context_length * 2 + 1):
    test_feed[x_context[i]] = x_context_data[:, i, :]
test_feed[x_target] = x_target_mean_data

[x_context_data, x_target_mean_data, y_data, feature_data] = dev_batcher.next()
dev_feed = {y: y_data, keep_prob_context: [1], keep_prob_target: [1], feature: feature_data}
for i in range(args.context_length * 2 + 1):
    dev_feed[x_context[i]] = x_context_data[:, i, :]
dev_feed[x_target] = x_target_mean_data

ite = 0
train_batcher.shuffle()
# TRAINING
l = 0.
for step in range(50000001):
    [x_context_data, x_target_mean_data, y_data, feature_data] = train_batcher.next()
    feed = {y: y_data, keep_prob_context: [args.keep_prob_context],
            keep_prob_target: [args.keep_prob_target], feature: feature_data}
    for i in range(args.context_length * 2 + 1):
        feed[x_context[i]] = x_context_data[:, i, :]
    feed[x_target] = x_target_mean_data
    sess.run(optimizer, feed_dict=feed)
    l += sess.run(loss, feed_dict=feed)

    if step % 50 == 0 and step > 1:
        ite += 1
        # print(step, l / 100.)
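
The three feed dictionaries above (test, dev and train) follow the same pattern and differ only in the data source and the dropout keep probabilities. A small helper such as the following, a sketch reusing the placeholder names already defined above rather than code from the original script, would capture that pattern.

def build_feed(batch, keep_context, keep_target):
    # batch is the 4-tuple returned by a batcher's next():
    # (x_context_data, x_target_mean_data, y_data, feature_data)
    x_context_data, x_target_mean_data, y_data, feature_data = batch
    feed = {y: y_data,
            keep_prob_context: [keep_context],
            keep_prob_target: [keep_target],
            feature: feature_data}
    for i in range(args.context_length * 2 + 1):
        feed[x_context[i]] = x_context_data[:, i, :]
    feed[x_target] = x_target_mean_data
    return feed

# e.g. test_feed = build_feed(test_batcher.next(), 1, 1)
# and  feed = build_feed(train_batcher.next(), args.keep_prob_context, args.keep_prob_target)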
Example #4
    def step(self):
        rollout = []
        hyperparameters = self.hyperparameters

        # Reset the environment and gather a fixed-length rollout from all
        # parallel agents before running the PPO update below.
        env_info = self.environment.reset(train_mode=True)[self.brain_name]
        self.states = env_info.vector_observations
        states = self.states

        for _ in range(hyperparameters['rollout_length']):
            actions, log_probs, _, values = self.network(states)
            env_info = self.environment.step(actions.cpu().detach().numpy())[self.brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            terminals = np.array([1 if t else 0 for t in env_info.local_done])
            self.all_rewards += rewards
            
            for i, terminal in enumerate(terminals):
                if terminals[i]:
                    self.episode_rewards.append(self.all_rewards[i])
                    self.all_rewards[i] = 0
                    
            rollout.append([states, values.detach(), actions.detach(), log_probs.detach(), rewards, 1 - terminals])
            states = next_states

        self.states = states
        pending_value = self.network(states)[-1]
        rollout.append([states, pending_value, None, None, None, None])

        # Work backwards through the rollout to compute discounted returns and
        # generalized advantage estimates; note that the stored "terminals"
        # entry is actually the continuation mask 1 - done.
        processed_rollout = [None] * (len(rollout) - 1)
        advantages = torch.Tensor(np.zeros((self.config['environment']['number_of_agents'], 1)))
        returns = pending_value.detach()
        for i in reversed(range(len(rollout) - 1)):
            states, value, actions, log_probs, rewards, terminals = rollout[i]
            terminals = torch.Tensor(terminals).unsqueeze(1)
            rewards = torch.Tensor(rewards).unsqueeze(1)
            actions = torch.Tensor(actions.cpu())
            states = torch.Tensor(states)
            next_value = rollout[i + 1][1]
            # Discounted return, bootstrapped from the next value estimate.
            returns = rewards + hyperparameters['discount_rate'] * terminals * returns.cpu()
            # One-step TD error, accumulated into the GAE advantage below.
            td_error = rewards + hyperparameters['discount_rate'] * terminals * next_value.detach().cpu() - value.detach().cpu()
            advantages = advantages * hyperparameters['tau'] * hyperparameters['discount_rate'] * terminals + td_error
            processed_rollout[i] = [states, actions, log_probs, returns, advantages]

        # Flatten the rollout across time and normalize the advantages.
        states, actions, log_probs_old, returns, advantages = map(lambda x: torch.cat(x, dim=0), zip(*processed_rollout))
        advantages = (advantages - advantages.mean()) / advantages.std()

        # Optimize the clipped PPO objective for several epochs over shuffled mini-batches.
        batcher = Batcher(states.size(0) // hyperparameters['mini_batch_number'], [np.arange(states.size(0))])
        for _ in range(hyperparameters['optimization_epochs']):
            batcher.shuffle()
            while not batcher.end():
                batch_indices = batcher.next_batch()[0]
                batch_indices = torch.Tensor(batch_indices).long()
                sampled_states = states[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_log_probs_old = log_probs_old[batch_indices]
                sampled_returns = returns[batch_indices]
                sampled_advantages = advantages[batch_indices]

                _, log_probs, entropy_loss, values = self.network(sampled_states, sampled_actions)
                ratio = (log_probs - sampled_log_probs_old).exp()
                obj = ratio * sampled_advantages
                obj_clipped = ratio.clamp(1.0 - hyperparameters['ppo_clip'],
                                          1.0 + hyperparameters['ppo_clip']) * sampled_advantages
                policy_loss = -torch.min(obj, obj_clipped).mean(0) - hyperparameters['entropy_coefficent'] * entropy_loss.mean()
                value_loss = 0.5 * (sampled_returns - values.cpu()).pow(2).mean()

                self.optimizier.zero_grad()
                (policy_loss + value_loss).backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), hyperparameters['gradient_clip'])
                self.optimizier.step()

        steps = hyperparameters['rollout_length'] * self.config['environment']['number_of_agents']
        self.total_steps += steps
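
For reference, the clipped surrogate objective computed inside the optimization loop above can be isolated as a standalone function. This is a sketch of the same computation with explicit arguments, not code taken from the original agent, and it omits the entropy bonus the agent adds via its entropy_coefficent setting.

import torch

def ppo_clipped_policy_loss(log_probs, log_probs_old, advantages, clip_epsilon):
    # Probability ratio between the current policy and the policy that
    # generated the rollout.
    ratio = (log_probs - log_probs_old).exp()
    unclipped = ratio * advantages
    clipped = ratio.clamp(1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    # Taking the elementwise minimum makes the objective pessimistic,
    # which keeps each policy update inside the clipping region.
    return -torch.min(unclipped, clipped).mean()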