Example #1
    def forward(self,
                instruction=None,
                observation=None,
                memory=None,
                compute_message_probs=False,
                time=None):
        if not hasattr(self, 'random_corrector'):
            self.random_corrector = False
        if not hasattr(self, 'var_len'):
            self.var_len = False
        if not hasattr(self, 'script'):
            self.script = False

        if not self.script:
            memory_rnn_output, memory = self.forward_film(
                instruction=instruction,
                observation=observation,
                memory=memory)

            batch_size = instruction.size(0)
            correction_encodings = []

            entropy = 0.0

            lengths = np.array([self.corr_length] * batch_size)
            total_corr_loss = 0

            for i in range(self.corr_length):
                if i == 0:
                    # every message starts with a SOS token
                    decoder_input = torch.tensor([self.sos_id] * batch_size,
                                                 dtype=torch.long,
                                                 device=self.device)
                    decoder_input_embedded = self.word_embedding_corrector(
                        decoder_input).unsqueeze(1)
                    decoder_hidden = memory_rnn_output.unsqueeze(0)

                if self.random_corrector:
                    # randomize corrections
                    device = torch.device(
                        "cuda" if decoder_input_embedded.is_cuda else "cpu")
                    decoder_input_embedded = torch.randn(
                        decoder_input_embedded.size(), device=device)
                    decoder_hidden = torch.randn(decoder_hidden.size(),
                                                 device=device)

                rnn_output, decoder_hidden = self.decoder_rnn(
                    decoder_input_embedded, decoder_hidden)
                vocab_scores = self.out(rnn_output)
                vocab_probs = F.softmax(vocab_scores, dim=-1)

                entropy += Categorical(vocab_probs).entropy()

                tau = 1.0 / (self.tau_layer(decoder_hidden).squeeze(0) +
                             self.max_tau)
                tau = tau.expand(-1, self.num_embeddings).unsqueeze(1)

                if self.training:
                    # Apply Gumbel SM
                    cat_distr = RelaxedOneHotCategorical(tau, vocab_probs)
                    corr_weights = cat_distr.rsample()
                    corr_weights_hard = torch.zeros_like(corr_weights,
                                                         device=self.device)
                    corr_weights_hard.scatter_(
                        -1, torch.argmax(corr_weights, dim=-1, keepdim=True),
                        1.0)

                    # detach() detaches the output from the computation graph, so no gradient will be backprop'ed along this variable
                    corr_weights = (corr_weights_hard -
                                    corr_weights).detach() + corr_weights

                else:
                    # greedy sample
                    corr_weights = torch.zeros_like(vocab_probs,
                                                    device=self.device)
                    corr_weights.scatter_(
                        -1, torch.argmax(vocab_probs, dim=-1, keepdim=True),
                        1.0)

                if self.var_len:
                    # consider sequence done when eos receives highest value
                    max_idx = torch.argmax(corr_weights, dim=-1)
                    eos_batches = max_idx.data.eq(self.eos_id)
                    if eos_batches.dim() > 0:
                        eos_batches = eos_batches.cpu().view(-1).numpy()
                        update_idx = ((lengths > i) & eos_batches) != 0
                        lengths[update_idx] = i

                    # compute correction error through pseudo-target: sequence of eos symbols to encourage short messages
                    pseudo_target = torch.tensor(
                        [self.eos_id for j in range(batch_size)],
                        dtype=torch.long,
                        device=self.device)
                    loss = self.correction_loss(corr_weights.squeeze(1),
                                                pseudo_target)
                    total_corr_loss += loss

                correction_encodings += [corr_weights]
                decoder_input_embedded = torch.matmul(
                    corr_weights, self.word_embedding_corrector.weight)

            # one-hot vectors on forward, soft approximations on backward pass
            correction_encodings = torch.stack(correction_encodings,
                                               dim=1).squeeze(2)

            lengths = torch.tensor(lengths,
                                   dtype=torch.long,
                                   device=self.device)

            result = {
                'correction_encodings':
                correction_encodings,
                'correction_messages':
                self.decode_corrections(correction_encodings),
                'correction_entropy':
                torch.mean(entropy),
                'corrector_memory':
                memory,
                'correction_lengths':
                lengths,
                'correction_loss':
                total_corr_loss
            }

        else:
            # there is a script of pre-established guidance messages
            correction_messages = self.script[time]
            correction_encodings = self.encode_corrections(correction_messages)
            result = {
                'correction_encodings': correction_encodings,
                'correction_messages': correction_messages
            }

        return result
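
The training branch above relies on a straight-through Gumbel-Softmax estimator: hard one-hot samples on the forward pass, gradients of the relaxed sample on the backward pass. A minimal standalone sketch of that trick (the helper name and shapes are illustrative, not taken from the module above):

# Standalone sketch of the straight-through Gumbel-Softmax sampling used above.
import torch
import torch.nn.functional as F
from torch.distributions import RelaxedOneHotCategorical

def straight_through_sample(logits, tau=1.0):
    """Return a one-hot sample on the forward pass while keeping the
    relaxed (differentiable) sample's gradient on the backward pass."""
    probs = F.softmax(logits, dim=-1)
    relaxed = RelaxedOneHotCategorical(torch.as_tensor(tau), probs).rsample()
    hard = torch.zeros_like(relaxed)
    hard.scatter_(-1, relaxed.argmax(dim=-1, keepdim=True), 1.0)
    # forward: `hard`; backward: gradients flow only through `relaxed`
    return (hard - relaxed).detach() + relaxed

if __name__ == "__main__":
    logits = torch.randn(4, 10, requires_grad=True)
    straight_through_sample(logits).sum().backward()
    print(logits.grad.shape)  # gradients reach the logits: torch.Size([4, 10])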
Example #2
 def forward(self, x, h):
     x, h = self.gru(x, h)
     x = self.relu1(x)
     val = self.value(x)
     dist = self.policy(x).squeeze(0)
     return Categorical(logits=dist), val, h
Example #3
 def dist(self, x: torch.Tensor) -> Categorical:
     h = self._encoder(x)
     h = self._fc(h)
     return Categorical(torch.softmax(h, dim=1))
 def sample(self, datas):
     distribution = Categorical(datas)
     return distribution.sample().float().to(device)
 def logprob(self, datas, value_data):
     distribution = Categorical(datas)
     return distribution.log_prob(value_data).float().to(device)
Example #6
import torch
from torch.distributions import Categorical

p_tensor = torch.Tensor([0.25, 0.25, 0.25, 0.25, 0.7, 0.1, 0.1, 0.1,
                         0.25, 0.25, 0.25, 0.25, 0.7, 0.1, 0.1, 0.1])
print(p_tensor.shape)
p_tensor = p_tensor.view(-1, 4)
print(p_tensor)
entropy2 = Categorical(probs=p_tensor).entropy()
print(entropy2.shape)
print(entropy2)


p_tensor = torch.Tensor([0.7, 0.1, 0.1, 0.1])
entropy2 = Categorical(probs=p_tensor).entropy()
print(entropy2)
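
As a quick sanity check of the entropies printed above, the two distinct rows can be computed by hand from the definition H(p) = -sum_i p_i * log(p_i) (the values below come from that formula, not from any script output):

# Hand check of the entropies above (in nats).
import math

print(math.log(4))                          # uniform row [0.25, 0.25, 0.25, 0.25]: ≈ 1.3863
p = [0.7, 0.1, 0.1, 0.1]
print(-sum(pi * math.log(pi) for pi in p))  # skewed row: ≈ 0.9404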
Example #7
render_rate = 100 # render every render_rate episodes
while True:
    rewards = []
    actions = []
    states  = []
    # reset environment
    state = env.reset()
    while True:
        # render episode every render_rate episodes
        if n_episode % render_rate == 0:
            env.render()

        # calculate probabilities of taking each action
        probs = policy(torch.tensor(state).unsqueeze(0).float())
        # sample an action from that set of probs
        sampler = Categorical(probs)
        action = sampler.sample()

        # use that action in the environment
        new_state, reward, done, info = env.step(action.item())
        # store state, action and reward
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = new_state
        if done:
            break

    # preprocess rewards
    rewards = np.array(rewards)
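
    The snippet ends right after the trajectory is collected; below is a minimal sketch of how such a loop is typically completed with discounted returns and the REINFORCE loss, assuming `policy` returns action probabilities and that an `optimizer` over its parameters and a discount `gamma` exist (the last two are assumptions, not shown above).

    # Assumed continuation (illustrative): discounted returns and the REINFORCE update.
    gamma = 0.99
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # variance reduction

    states_t = torch.tensor(np.array(states), dtype=torch.float32)
    actions_t = torch.stack(actions)
    log_probs = Categorical(policy(states_t)).log_prob(actions_t)

    loss = -(log_probs * torch.tensor(returns)).sum()
    optimizer.zero_grad()   # `optimizer` is assumed, e.g. Adam over policy.parameters()
    loss.backward()
    optimizer.step()

    n_episode += 1          # `n_episode` is read above but never incremented in the snippet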
Example #8
    def monte_carlo_expansion(self, action, decoder_hidden, encoder_outputs,
                              full_input_variable_batch, initial_sequence,
                              max_sample_length, batch_size):
        # Prepare next decoder input with UNK
        unk_check_time_start = time.time()
        action = action.data
        action = where(action < self.MONTE_CARLO_UPPER_BOUND, action,
                       self.MONTE_CARLO_MASK)
        decoder_input = Variable(action.unsqueeze(1))
        timings[timings_var_unk_check] += time.time() - unk_check_time_start

        monte_carlo_cat_time_start_2 = time.time()
        if initial_sequence is not None:
            start = len(initial_sequence.data[0]) + 1
            decoder_output_variables = torch.cat(
                (initial_sequence, decoder_input), 1)
        else:
            start = 1
            decoder_output_variables = decoder_input
        timings[timings_var_monte_carlo_cat] += (time.time() -
                                                 monte_carlo_cat_time_start_2)

        for di in range(start, max_sample_length):
            monte_carlo_inner_time_start = time.time()
            decoder_output, decoder_hidden, _ \
                = self.decoder(decoder_input, decoder_hidden, encoder_outputs, full_input_variable_batch,
                               batch_size)
            timings[timings_var_monte_carlo_inner] += (
                time.time() - monte_carlo_inner_time_start)

            before_topk_monte = time.time()
            m = Categorical(decoder_output)
            action = m.sample()
            timings[timings_var_monte_carlo_top1] += (time.time() -
                                                      before_topk_monte)

            unk_check_time_start = time.time()
            action = action.data
            action = where(action < self.MONTE_CARLO_UPPER_BOUND, action,
                           self.MONTE_CARLO_MASK)
            decoder_input = Variable(action.unsqueeze(1))
            timings[timings_var_unk_check] += time.time(
            ) - unk_check_time_start

            monte_carlo_cat_time_start = time.time()
            decoder_output_variables = torch.cat(
                (decoder_output_variables, decoder_input), 1)
            timings[timings_var_monte_carlo_cat] += (
                time.time() - monte_carlo_cat_time_start)

        # Add EOS to the samples that did not produce EOS, and add PAD to rest
        unk_check_time_start = time.time()
        last_tokens = where(action > self.EOS_MATRIX_MONTE_CARLO,
                            self.EOS_MATRIX_MONTE_CARLO,
                            self.PAD_MATRIX_MONTE_CARLO)
        timings[timings_var_unk_check] += time.time() - unk_check_time_start
        monte_carlo_cat_time_start = time.time()
        decoder_output_variables = torch.cat(
            (decoder_output_variables, Variable(last_tokens.unsqueeze(1))), 1)
        timings[timings_var_monte_carlo_cat] += (time.time() -
                                                 monte_carlo_cat_time_start)

        return decoder_output_variables
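
Both this example and the next lean on the fact that `Categorical` accepts a whole batch of probability rows at once; a minimal standalone illustration of that primitive (not taken from the original model):

# A (batch, vocab) probability matrix defines one categorical distribution per
# row: sample() returns one index per row, log_prob() one log-probability per row.
import torch
from torch.distributions import Categorical

decoder_output = torch.softmax(torch.randn(3, 5), dim=-1)  # batch of 3, vocab of 5
m = Categorical(decoder_output)
action = m.sample()                 # shape: (3,)
print(action, m.log_prob(action))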
Example #9
    def train_on_batch(self, input_variable_batch, full_input_variable_batch,
                       input_lengths, full_target_variable_batch,
                       target_lengths, discriminator, max_monte_carlo_length,
                       target_variable, extended_vocabs,
                       full_target_variable_batch_2):

        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        max_target_length = max(target_lengths)

        init_encoder_time_start = time.time()
        encoder_outputs, encoder_hidden = self.encoder(input_variable_batch,
                                                       input_lengths, None)
        encoder_hidden = concat_encoder_hidden_directions(encoder_hidden)
        timings[timings_var_init_encoder] += (time.time() -
                                              init_encoder_time_start)

        # Argmax baseline
        if not self.use_running_avg_baseline:
            baseline = self.get_argmax_baseline(
                encoder_hidden, encoder_outputs, max_target_length,
                full_input_variable_batch, discriminator,
                full_target_variable_batch_2, extended_vocabs)
            print_baseline = baseline.sum(0).item() / self.batch_size

        # MLE loss
        if self.beta < 1.00:
            mle_loss = self.get_teacher_forcing_mle(
                encoder_hidden, encoder_outputs, max_target_length,
                full_input_variable_batch, full_target_variable_batch,
                target_variable)

        decoder_input = Variable(
            torch.LongTensor([SOS_token] * self.batch_size))
        decoder_input = decoder_input.cuda(
        ) if self.use_cuda else decoder_input
        decoder_hidden = encoder_hidden

        full_policy_values = []
        full_sequence_rewards = []
        accumulated_sequence = None

        policy_iteration_time_start = time.time()

        monte_carlo_length = min(max_target_length, max_monte_carlo_length)
        num_samples = 0

        eos_check_start = monte_carlo_length - 10

        multiply_time_2 = time.time()
        encoder_outputs_temp = multiply_data_in_dim(
            encoder_outputs, self.num_monte_carlo_samples, dim=1)
        full_target_variable_batch_2 = full_target_variable_batch_2 * self.num_monte_carlo_samples
        full_input_variable_batch_temp = multiply_data_in_dim(
            full_input_variable_batch, self.num_monte_carlo_samples, dim=0)
        timings[timings_var_copy_params] += time.time() - multiply_time_2

        # Policy iteration
        for di in range(monte_carlo_length):
            decoder_output, decoder_hidden, decoder_attention \
                = self.decoder(decoder_input, decoder_hidden, encoder_outputs, full_input_variable_batch,
                               self.batch_size)

            # Sample things
            # Currently always sampling the first token (to make sure there is at least 1 sampling per batch)
            sampling = random.random() <= self.sample_rate
            if sampling or di == 0:
                num_samples += 1
                m = Categorical(decoder_output)
                action = m.sample()
                log_prob = m.log_prob(action)
                full_policy_values.append(log_prob)

                monte_carlo_time_start = time.time()
                # Multiply the batch size by monte_carlo_samples
                multiply_time = time.time()
                # NOTE: Not sure if we need to do .clone() here, but just to be safe.
                action_temp = multiply_data_in_dim(
                    action.clone(), self.num_monte_carlo_samples, dim=0)
                decoder_hidden_temp = multiply_data_in_dim(
                    decoder_hidden, self.num_monte_carlo_samples, dim=1)
                if di == 0:
                    accumulated_sequence_temp = None
                else:
                    accumulated_sequence_temp = multiply_data_in_dim(
                        accumulated_sequence,
                        self.num_monte_carlo_samples,
                        dim=0)
                batch_size_temp = self.batch_size * self.num_monte_carlo_samples
                timings[timings_var_copy_params] += time.time() - multiply_time

                sample_multiplied \
                    = self.monte_carlo_expansion(action_temp, decoder_hidden_temp, encoder_outputs_temp,
                                                 full_input_variable_batch_temp, accumulated_sequence_temp,
                                                 monte_carlo_length, batch_size_temp)

                monte_carlo_outer_time_start = time.time()
                current_reward, gan_reward, rouge_reward = discriminator.evaluate(
                    sample_multiplied, full_target_variable_batch_2, None)
                current_reward_chunked = current_reward.chunk(
                    self.num_monte_carlo_samples, dim=0)
                temp_reward = 0
                for i in range(0, self.num_monte_carlo_samples):
                    temp_reward += current_reward_chunked[i]

                # calculate average reward
                avg_reward = temp_reward / self.num_monte_carlo_samples
                full_sequence_rewards.append(avg_reward)

                # add cumulative reward to calculate running average baseline
                if self.use_running_avg_baseline:
                    self.cumulative_reward += avg_reward.sum(
                        0) / self.batch_size
                    self.updates += 1
                timings[timings_var_monte_carlo_outer] += (
                    time.time() - monte_carlo_outer_time_start)
                timings[timings_var_monte_carlo] += (time.time() -
                                                     monte_carlo_time_start)

            # Get top1 for the next input to the decoder
            # topv, topi = decoder_output.data.topk(1)
            # ni = topi
            # ni = ni.squeeze(1)

            # sample next action
            m = Categorical(decoder_output)
            action = m.sample()
            ni = action.data

            # Check for EOS so that we stop sampling if all are EOS or PAD
            if di > eos_check_start:
                if is_whole_batch_pad_or_eos_squeezed(ni):
                    break

            # Remove UNK before setting next input to decoder
            unk_check_time_start = time.time()
            ni = where(ni < self.UPPER_BOUND, ni, self.MASK)
            decoder_input = Variable(ni.unsqueeze(1))
            timings[timings_var_unk_check] += time.time(
            ) - unk_check_time_start

            if accumulated_sequence is None:
                accumulated_sequence = decoder_input
            else:
                accumulated_sequence = torch.cat(
                    (accumulated_sequence, decoder_input), 1)

        if self.use_running_avg_baseline:
            # Calculate running average baseline
            baseline = self.cumulative_reward / self.updates
            print_baseline = baseline.item()

        policy_loss = 0
        total_print_reward = 0
        total_print_adjusted_reward = 0

        print_log_sum = 0
        for i in range(0, len(full_policy_values)):
            try:
                print_log_sum += torch.sum(full_policy_values[i])
                total_print_reward += torch.sum(full_sequence_rewards[i])
                adjusted_full_sequence_reward = full_sequence_rewards[
                    i] - baseline
                if not self.allow_negative_rewards:
                    for j in range(0, len(adjusted_full_sequence_reward.data)):
                        if adjusted_full_sequence_reward.data[j] < 0.0:
                            adjusted_full_sequence_reward.data[j] = 0.0
                total_print_adjusted_reward += torch.sum(
                    adjusted_full_sequence_reward)
                loss = -full_policy_values[i] * adjusted_full_sequence_reward
                policy_loss += torch.sum(loss) / self.batch_size
            except RuntimeError as e:
                log_message(e)
                log_message("Runtime error while updating print log sum")
                num_samples -= 1

        print_log_sum = print_log_sum / self.batch_size
        total_print_reward = total_print_reward / self.batch_size
        total_print_reward = total_print_reward / num_samples
        total_print_adjusted_reward = total_print_adjusted_reward / self.batch_size
        total_print_adjusted_reward = total_print_adjusted_reward / num_samples

        timings[timings_var_policy_iteration] += (time.time() -
                                                  policy_iteration_time_start)

        backprop_time_start = time.time()

        # divide by sequence length
        policy_loss = policy_loss / num_samples
        if self.beta < 1.00:
            mle_loss = mle_loss / max_target_length
            total_loss = self.beta * policy_loss + (1 - self.beta) * mle_loss
        else:
            total_loss = policy_loss

        total_loss.backward()

        clip = 2
        torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        timings[timings_var_backprop] += (time.time() - backprop_time_start)

        total_print_reward_data = total_print_reward.item()

        if self.beta < 1.00:
            return total_loss.item(), mle_loss.item(), policy_loss.item(), print_log_sum.item(), \
                   total_print_reward_data, print_baseline, total_print_adjusted_reward.item(), \
                   total_print_reward_data, total_print_reward_data
        else:
            return total_loss.item(), total_loss.item(), policy_loss.item(), print_log_sum.item(), \
                   total_print_reward_data, print_baseline, total_print_adjusted_reward.item(), \
                   total_print_reward_data, total_print_reward_data
Example #10
 def get_log_prob(self, state, action):
     # not volatile
     action_dist = self.forward(state)
     m = Categorical(action_dist)
     log_prob = m.log_prob(action)
     return log_prob
Example #11
 def choose_action(self, state):
     state = torch.unsqueeze(torch.FloatTensor(state), 0)
     action_values = self.policy(state)
     m = Categorical(action_values)
     action = m.sample()
     return action
Example #12
 def select_action(self, state):
     # volatile
     action_dist = self.forward(state)
     m = Categorical(action_dist)
     action = m.sample()
     return action.data
Example #13
def actor(rank, args, T, memory_queue, model_queue, p2):
    torch.manual_seed(args.seed + rank)
    # env = FrameStack(gym.make(args.env), 4)
    env = gym.make(args.env,
                   java_env_path="..",
                   port=args.port + rank * 2,
                   p2=p2)
    print("Process {} fighting with {}".format(rank, p2))
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space,
                        args.hidden_size)
    n_epi = 0

    # Actor Loop
    while T.value() <= args.T_max:
        t_value = T.value()
        try:
            with timeout(seconds=30):
                s = env.reset(p2=p2)
                # opp_s = flip_obs(s)
        except TimeoutError:
            print("Time out to reset env")
            env.close()
            continue
        if not model_queue.empty():
            print("Process {} going to load new model at EPISODE {}......".
                  format(rank, t_value))
            received_obj = model_queue.get()
            model_dict = copy.deepcopy(received_obj)
            model.load_state_dict(model_dict)
            print("Process {} finished loading new model at EPISODE {}!!!!!!".
                  format(rank, t_value))
            del received_obj
        action_mask = [False for _ in range(env.action_space.n)]
        action_mask = torch.BoolTensor(action_mask)
        done = False
        discard = False
        round_score = 0
        episode_length = 0
        sum_entropy = 0
        seq_data = []
        while not done:
            env.render()
            prob = model.pi(torch.from_numpy(s).float(), action_mask)
            sum_entropy += Categorical(probs=prob.detach()).entropy()
            a = Categorical(prob.detach()).sample().item()
            s_prime, r, done, info = env.step(a)
            # (opp_s_prime, opp_r, opp_done, _) = info.get('opp_transit', False)
            # opp_a = obs_get_action(opp_s_prime)
            if info.get('no_data_receive', False):
                env.close()
                discard = True
                break
            valid_actions = info.get('my_action_enough', {})
            # get valid actions
            if len(valid_actions) > 0:
                action_mask = [i not in valid_actions for i in range(56)]
            else:
                action_mask = [False for _ in range(env.action_space.n)]
            action_mask = torch.BoolTensor(action_mask)
            seq_data.append((s, a, r, prob.detach().numpy(), done,
                             action_mask.detach().numpy()))
            round_score += r
            s = s_prime
            episode_length += 1
        if not discard:
            n_epi += 1
            on_policy_data = (seq_data, (episode_length, round_score,
                                         sum_entropy / episode_length))
            print(
                "Process: {}, # of episode :{}, round score : {}, episode_length: {}"
                .format(rank, n_epi, round_score, episode_length))
            send_object = copy.deepcopy(on_policy_data)
            memory_queue.put(send_object, )
            print("Process {} send trajectory".format(rank))
    env.close()
 def get_action(self, state):
     probs, state_value = self.policynetwork(state)
     m = Categorical(probs)
     action = m.sample()
     self.policynetwork.saved_actions.append(SavedAction(m.log_prob(action), state_value))
     return action.data[0]
Example #15
    def _forward(self, x, t, temp, mask):
        # Transform markers and timesteps into the embedding spaces
        phi_x, phi_t = self.embed_x(x), self.embed_t(t)
        phi_xt = torch.cat([phi_x, phi_t], dim=-1)
        T, BS, _ = phi_x.shape

        ##Compute h_t Shape T+1, BS, dim
        # Run RNN over the concatenated embedded sequence
        h_0 = torch.zeros(1, BS, self.rnn_hidden_dim).to(device)
        # Run RNN
        hidden_seq, _ = self.rnn(phi_xt, h_0)
        # Append h_0 to h_1 .. h_T
        hidden_seq = torch.cat([h_0, hidden_seq], dim=0)

        ## Inference a_t= q([x_t, h_t], a_{t+1})
        # Get the sampled value and (mean + var) latent variable
        # using the hidden state sequence
        # posterior_sample_y, posterior_sample_z, posterior_logits_y, (posterior_mu_z, posterior_logvar_z) = self.encoder(phi_xt, hidden_seq[:-1, :,:], temp, mask)
        (posterior_sample_y,
         posterior_dist_y), (posterior_sample_z,
                             posterior_dist_z) = self.encoder(
                                 phi_xt, hidden_seq[:-1, :, :], temp, mask)

        # Create distributions for Posterior random vars
        # posterior_dist_z = Normal(posterior_mu_z, torch.exp(posterior_logvar_z*0.5))
        # posterior_dist_y = Categorical(logits=posterior_logits_y)

        # Prior is just a Normal(0,1) dist for z and Uniform Categorical for y
        #prior dist z is TxBSx latent_dim. T=0=> Normal(0,1)
        prior_mu, prior_logvar = self.prior(posterior_sample_z)  ##Normal(0, 1)
        prior_dist_z = Normal(prior_mu, (prior_logvar * 0.5).exp())

        prior_dist_y = Categorical(
            probs=1. / self.cluster_dim *
            torch.ones(1, BS, self.cluster_dim).to(device))

        ## Generative Part

        # Use the embedded markers and times to create another set of
        # hidden vectors. Can reuse the h_0 and time_marker combined computed above

        # Combine (z_t, h_t, y) form the input for the generative part
        concat_hzy = torch.cat(
            [hidden_seq[:-1], posterior_sample_z, posterior_sample_y], dim=-1)
        # phi_hzy = self.gen_pre_module(concat_hzy)
        # mu_marker, logvar_marker = generate_marker(self, phi_hzy, None)
        # time_log_likelihood, mu_time = compute_point_log_likelihood(self, phi_hzy, t)
        # marker_log_likelihood = compute_marker_log_likelihood(self, x, mu_marker, logvar_marker)

        phi_hzy = self.decoder(concat_hzy)

        dist_marker_recon = self.decoder.generate_marker(phi_hzy, t)
        time_log_likelihood, mu_time = self.decoder.compute_time_log_prob(
            phi_hzy, t)  #(T,BS)
        marker_log_likelihood = self.decoder.compute_marker_log_prob(
            x, dist_marker_recon)  #(T,BS)

        KL_cluster = kl_divergence(posterior_dist_y, prior_dist_y)
        KL_z = kl_divergence(posterior_dist_z, prior_dist_z).sum(-1) * mask
        KL = KL_cluster.sum() + KL_z.sum()
        if not (KL >= 0):
            raise ValueError("KL should be non-negative")

        metric_dict = {"z_cluster": 0}
        with torch.no_grad():
            # Metric not needed
            """
            if self.time_loss == 'intensity':
                mu_time = compute_time_expectation(self, phi_hzy, t, mask)[:,:, None]
            get_marker_metric(self.marker_type, mu_marker, x, mask, metric_dict)
            get_time_metric(mu_time,  t, mask, metric_dict)
            """
            metric_dict['marker_acc'] = -1.
            metric_dict['marker_acc_count'] = 1.
            metric_dict['time_mse'] = 1.
            metric_dict['time_mse_count'] = 1.

        return time_log_likelihood, marker_log_likelihood, KL, metric_dict
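
The KL terms above compare categorical posteriors against a uniform categorical prior; a small standalone check (illustrative, not from the model) that kl_divergence between two Categorical distributions matches the closed form log(K) - H(posterior) when the prior is uniform over K classes:

# KL(posterior || uniform prior over K) == log(K) - H(posterior)
import torch
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence

posterior = Categorical(probs=torch.tensor([0.7, 0.1, 0.1, 0.1]))
prior = Categorical(probs=torch.full((4,), 0.25))
print(kl_divergence(posterior, prior))                     # ≈ 0.4459
print(torch.log(torch.tensor(4.0)) - posterior.entropy())  # ≈ 0.4459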
 def select_action(self, state):
     probs = self.forward(state)
     m = Categorical(probs)
     action = m.sample()
     self.saved_log_probs.append(m.log_prob(action))
     return probs, action.data[0]
Example #17
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir
    embedding_matrix = numpy.load(embedding_file)

    best_network_file = "./model/network_model_pretrain.best.top"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    manager = network.Network(
        nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"],
        nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000,
        nnargs["embedding_size"], nnargs["embedding_dimention"],
        embedding_matrix).cuda()
    net_copy(manager, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs_iter = DataReader.DataGnerater("train" + reduced)
    #train_docs_iter = DataReader.DataGnerater("dev"+reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev" + reduced)
    test_docs_iter = DataReader.DataGnerater("test" + reduced)
    '''
    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter,worker,manager) 
    print "Average:",metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter,worker,manager) 
    print "Average:",metric["average"]
    print "***"
    print
    sys.stdout.flush()
    '''

    lr = nnargs["lr"]
    top_k = nnargs["top_k"] * 20

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()

    optimizer_manager = optim.RMSprop(manager.parameters(), lr=lr, eps=1e-6)

    MAX_AVE = 2048

    for echo in range(nnargs["epoch"]):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        reward_log = Logger(Tensorboard + args.tb +
                            "/acl2018/%d/reward/" % echo,
                            flush_secs=3)
        entropy_log_manager = Logger(Tensorboard + args.tb +
                                     "/acl2018/%d/entropy/manager" % echo,
                                     flush_secs=3)
        entropy_log_worker = Logger(Tensorboard + args.tb +
                                    "/acl2018/%d/entropy/worker" % echo,
                                    flush_secs=3)

        train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')
        #train_docs = utils.load_pickle(args.DOCUMENT + 'dev_docs.pkl')
        docs_by_id = {doc.did: doc for doc in train_docs}

        ave_reward = []
        ave_manager_entropy = []
        ave_worker_entropy = []

        print >> sys.stderr, "Link docs ..."
        tmp_data = []
        cluster_info = {0: [0]}
        cluster_list = [0]
        current_new_cluster = 1
        predict_action_embedding = []
        choose_action = []
        mid = 1

        path = []

        step = 0

        statistic = {
            "worker_hits": 0,
            "manager_hits": 0,
            "total": 0,
            "manager_predict_last": 0,
            "worker_predict_last": 0
        }

        for data in train_docs_iter.rl_case_generater(shuffle=True):

            rl = data["rl"]

            scores_manager, representations_manager = get_score_representations(
                manager, data)

            doc = docs_by_id[rl["did"]]

            for s, e in zip(rl["starts"], rl["ends"]):
                action_embeddings = representations_manager[s:e]

                #score = score_softmax(torch.transpose(scores_manager[s:e],0,1)).data.cpu().numpy()[0]
                #index = utils.choose_action(score)

                probs = score_softmax(
                    torch.transpose(scores_manager[s:e], 0, 1))
                m = Categorical(probs)
                this_action = m.sample()
                index = this_action.data.cpu().numpy()[0]

                if index == (e - s - 1):
                    should_cluster = current_new_cluster
                    cluster_info[should_cluster] = []
                    current_new_cluster += 1
                else:
                    should_cluster = cluster_list[index]

                choose_action.append(index)
                cluster_info[should_cluster].append(mid)
                cluster_list.append(should_cluster)
                mid += 1

                link = index
                m1, m2 = rl['ids'][s + link]
                doc.link(m1, m2)

                path.append(index)

                #cluster_indexs = torch.cuda.LongTensor(cluster_info[should_cluster])
                #action_embedding_predict = torch.mean(action_embeddings[cluster_indexs],0,keepdim=True)
                #predict_action_embedding.append(action_embedding_predict)

            tmp_data.append(data)

            if rl["end"] == True:
                inside_index = 0
                manager_entropy = 0.0
                for data in tmp_data:
                    new_step = step
                    rl = data["rl"]
                    reward = doc.get_f1()

                    ave_reward.append(reward)
                    if len(ave_reward) >= MAX_AVE:
                        ave_reward = ave_reward[1:]
                    reward_log.log_value(
                        'reward',
                        float(sum(ave_reward)) / float(len(ave_reward)),
                        new_step)

                    scores_manager, representations_manager = get_score_representations(
                        manager, data, dropout=nnargs["dropout_rate"])
                    doc_weight = (len(doc.mention_to_gold) +
                                  len(doc.mentions)) / 10.0
                    baselines = []
                    for s, e in zip(rl["starts"], rl["ends"]):
                        score = score_softmax(
                            torch.transpose(scores_manager[s:e], 0, 1))

                        ids = rl['ids'][s:e]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][s:e]
                        for ant_ind in range(e - s):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0],
                                                      ana,
                                                      hypothetical=True,
                                                      beta=1)
                        doc.link(old_ant, ana)
                        #costs -= costs.max()
                        #costs *= -doc_weight
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))

                        if not score.size()[1] == costs.size()[0]:
                            #print "score.size not comparable with costs.size at",rl["did"]
                            continue
                        #baseline = torch.sum(score*costs)
                        baseline = torch.sum(score *
                                             costs).data.cpu().numpy()[0]
                        baselines.append(baseline)

                    optimizer_manager.zero_grad()
                    manager_loss = None

                    for s, e, i in zip(rl["starts"], rl["ends"],
                                       range(len(rl["ends"]))):
                        baseline = baselines[i]
                        action = path[inside_index]
                        score = torch.squeeze(
                            score_softmax(
                                torch.transpose(scores_manager[s:e], 0, 1)))
                        this_cost = torch.log(
                            score[action]) * -1.0 * (reward - baseline)

                        if manager_loss is None:
                            manager_loss = this_cost
                        else:
                            manager_loss += this_cost
                        manager_entropy += torch.sum(
                            score *
                            torch.log(score + 1e-7)).data.cpu().numpy()[0]
                        inside_index += 1

                    manager_loss.backward()
                    torch.nn.utils.clip_grad_norm(manager.parameters(),
                                                  nnargs["clip"])
                    optimizer_manager.step()

                    ave_manager_entropy.append(manager_entropy)
                    if len(ave_manager_entropy) >= MAX_AVE:
                        ave_manager_entropy = ave_manager_entropy[1:]
                    entropy_log_manager.log_value(
                        'entropy',
                        float(sum(ave_manager_entropy)) /
                        float(len(ave_manager_entropy)), new_step)

                    new_step += 1

                step = new_step
                tmp_data = []
                cluster_info = {0: [0]}
                cluster_list = [0]
                current_new_cluster = 1
                mid = 1
                predict_action_embedding = []
                choose_action = []
                path = []

        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time -
                                                            start_time)
        print >> sys.stderr, "save model ..."
        #print "Top k",top_k
        print "Worker Hits", statistic[
            "worker_hits"], "Manager Hits", statistic[
                "manager_hits"], "Total", statistic["total"]
        print "Worker predict last", statistic[
            "worker_predict_last"], "Manager predict last", statistic[
                "manager_predict_last"]
        #torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo)
        #torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo)

        print "DEV"
        #metric = performance.performance(dev_docs_iter,worker,manager)
        metric = performance.performance(dev_docs_iter, manager)
        print "Average:", metric["average"]
        print "TEST"
        metric = performance.performance(test_docs_iter, manager)
        print "Average:", metric["average"]
        print
        sys.stdout.flush()
Example #18
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        sum_log_probs = 0
        values = []
        sum_rewards = 0
        entropy = 0
        state = envs.reset(_i)
        emb = encoder(state)
        first = True
        while True:
            if first:
                first = False
                prob, h, c = decoder(emb, envs.masks().to(device))
                dist = Categorical(prob)
                actions = dist.sample()
                log_prob = dist.log_prob(actions)
            else:
                prob, h, c = decoder(emb, masks, last_actions, h, c)
                dist = Categorical(prob)
                actions = dist.sample()
                log_prob = dist.log_prob(actions)

            last_actions = actions
            rewards, dones, masks, all_done = envs.step(actions.cpu().detach().numpy())
            if all_done:
                break

            sum_log_probs += log_prob * (1-dones)
            sum_rewards += rewards * (1-dones)
Example #19
 def select_action(self, state):
     probs, state_value = self.model( torch.from_numpy(state).float() )
     m = Categorical(probs)  # build a categorical distribution over actions from the network's probabilities
     action = m.sample()
     self.model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
     return action.item()
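
The SavedAction pattern above only records (log_prob, state_value) pairs; below is a minimal sketch of how they are typically consumed at the end of an episode (`saved_actions`, `rewards`, `gamma`, and `optimizer` are assumptions, not shown in the original).

# Assumed end-of-episode update for the SavedAction pattern (illustrative only).
import torch
import torch.nn.functional as F

returns, R = [], 0.0
for r in reversed(rewards):                  # discounted returns
    R = r + gamma * R
    returns.insert(0, R)
returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + 1e-8)

policy_losses, value_losses = [], []
for (log_prob, value), R in zip(saved_actions, returns):
    advantage = R - value.item()
    policy_losses.append(-log_prob * advantage)                # actor term
    value_losses.append(F.smooth_l1_loss(value.squeeze(), R))  # critic term

optimizer.zero_grad()
(torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()).backward()
optimizer.step()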
Example #20
    def choose_action(self, x):
        x = x[np.newaxis, :]
        probs, value = self.forward(x)
        dist = Categorical(probs)

        return dist, value
Example #21
    def forward(self, states):
        logits = self.actor_net(states)
        pi = Categorical(logits=logits)
        actions = pi.sample()

        return pi, actions
 def evaluate_action(self, state, action):
     probs = self.forward(state)
     action_distribution = Categorical(probs=probs)
     log_prob = action_distribution.log_prob(action)
     entropy = action_distribution.entropy().mean()
     return action_distribution.probs, log_prob, entropy
 def entropy(self, datas):
     distribution = Categorical(datas)
     return distribution.entropy().float().to(device)
Example #24
 def forward(self, x):
     x = self.decoder(x)
     B, _, H, W = x.size()
     x = x.view(B, 3, 256, H, W).permute(0, 1, 3, 4, 2)
     dist = Categorical(logits=x)
     return dist
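
`Categorical(logits=x)` here receives a 5-D tensor: everything before the last dimension is treated as batch, and the last dimension (256) as the category axis. A quick standalone shape check (illustrative sizes, not the model's):

# Sampling from a multi-dimensional Categorical keeps all leading batch dims.
import torch
from torch.distributions import Categorical

dist = Categorical(logits=torch.randn(2, 3, 8, 8, 256))
sample = dist.sample()
print(sample.shape)                  # torch.Size([2, 3, 8, 8]), values in [0, 255]
print(dist.log_prob(sample).shape)   # torch.Size([2, 3, 8, 8])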
Example #25
 def forward(self, x):
     value = self.critic(x)
     print(x)
     probs = self.actor(x.unsqueeze(0)).squeeze()
     dist = Categorical(probs)
     return dist, value
Example #26
 def act(self, state):
     state = torch.from_numpy(state).float().unsqueeze(0).to(device)
     probs = self.policyNetwork.forward(state).cpu()
     m = Categorical(probs)
     action = m.sample()
     return action.item(), m.log_prob(action)
Example #27
 def choose_action(self, s):
     prob = self.pi(
         torch.from_numpy(s).unsqueeze(0).float().to(device)).squeeze()
     m = Categorical(prob)
     a = m.sample().item()
     return a, prob
Example #28
env = pennies_game()

for t_eps in range(num_episode):
    mat_action = []

    mat_state1 = []
    mat_reward1 = []
    mat_done = []

    mat_state2 = []
    mat_reward2 = []
    state, _, _, _, _ = env.reset()
    #data_collection
    for i in range(batch_size):
        pi1 = p1()
        dist1 = Categorical(pi1)
        action1 = dist1.sample()

        pi2 = p2()
        dist2 = Categorical(pi2)
        action2 = dist2.sample()
        action = np.array([action1, action2])

        state = np.array([0,0])
        mat_state1.append(torch.FloatTensor(state))
        mat_state2.append(torch.FloatTensor(state))
        mat_action.append(torch.FloatTensor(action))
        #print(action)

        state, reward1, reward2, done, _ = env.step(action)
Example #29
 def forecast(self, theta):
     return Categorical(logits=theta)
Example #30
def a2c(env):

    num_inputs = 128
    num_outputs = env.action_space.n
    print(num_outputs)

    model = ActorCritic(num_inputs, num_outputs)
    ac_optimizer = optim.Adam(model.parameters(), lr=0.001)

    all_lengths = []
    average_lengths = []
    ep_rewards = []
    entropy_term = 0

    for episode in range(MAX_EPISODES):

        log_probs = []
        rewards = []
        state_values = []

        state = env.reset()
        steps = 0
        while True:
            state_value, probs_dist = model.forward(state)

            # sample an action from the predicted distribution
            m = Categorical(probs_dist)
            action = m.sample()
            log_prob = torch.log(probs_dist.squeeze(0)[action])
            log_probs.append(log_prob)
            # accumulate policy entropy so the 0.001 * entropy_term bonus in ac_loss is non-zero
            entropy_term += m.entropy().item()
            new_state, reward, done, _ = env.step(action)

            rewards.append(reward)
            state_values.append(state_value)
            state = new_state

            steps += 1
            if done:
                Qval, _ = model.forward(new_state)
                Qval = Qval.detach().numpy()[0, 0]
                print(np.sum(rewards))
                ep_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:
                    print(
                        "episode: {}, reward: {}, total length: {}, average length: {} \n"
                        .format(episode, np.sum(rewards), steps,
                                average_lengths[-1]))
                plot_durations(ep_rewards)
                break

        # compute Q values
        Qvals = np.zeros_like(state_values)
        for t in reversed(range(len(rewards))):
            Qval = rewards[t] + GAMMA * Qval
            Qvals[t] = Qval

        # update actor critic
        state_values = torch.FloatTensor(state_values)
        Qvals = torch.FloatTensor(list(Qvals))
        log_probs = torch.stack(log_probs)

        advantage = Qvals - state_values
        actor_loss = (-log_probs * advantage).mean()
        critic_loss = 0.5 * advantage.pow(2).mean()
        ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

        ac_optimizer.zero_grad()
        ac_loss.backward()
        ac_optimizer.step()