예제 #1
0
def gumbel_softmax(logits: FT,
                   temperature: float,
                   num_samples: Optional[int] = None) -> Tuple[FT, FT, LT, FT]:
    """Sample from the Gumbel-Softmax distribution and discretize the sample.

    Args:
        logits: named tensor carrying 'batch', 'length' and 'label' dimensions.
        temperature: forwarded unchanged to `gumbel_softmax_sample`.
        num_samples: forwarded to `gumbel_softmax_sample`. When it is not
            None, the sampled tensor is expected to carry an extra 'sample'
            dimension after 'label'.

    Returns:
        A 4-tuple `(y, y_one_hot, max_inds, seq_probs)`:

        * `y`: the relaxed sample, aligned to
          ('batch', 'length', 'label', ...).
        * `y_one_hot`: same layout as `y`. Its forward value is 1.0 wherever
          `y` equals its maximum over 'label' (ties mark every maximal entry)
          and 0.0 elsewhere, while gradients flow through `y`.
        * `max_inds`: indices of the maximum of `y` over 'label'.
        * `seq_probs`: product over 'length' of the entries of `y_one_hot`
          selected by `max_inds`, computed in log space.
    """
    logits = logits.align_to('batch', 'length', 'label')
    y = gumbel_softmax_sample(logits, temperature, num_samples)
    # The trailing ellipsis keeps any extra dimension (e.g. 'sample') last.
    y = y.align_to('batch', 'length', 'label', ...)
    max_values, max_inds = y.max(dim='label')
    y_one_hot = (max_values.align_as(y) == y).float()
    # Straight-through trick: the detached difference contributes the hard
    # value in the forward pass, and only the trailing `+ y` carries gradient.
    y_one_hot = (y_one_hot - y).detach() + y
    bi = get_named_range(logits.size('batch'), 'batch').align_as(max_inds)
    li = get_named_range(logits.size('length'), 'length').align_as(max_inds)
    # Pick, for every position, the entry of `y_one_hot` at the argmax label.
    if num_samples is None:
        with NoName(max_inds, y_one_hot, bi, li):
            probs = y_one_hot[bi, li, max_inds]
        probs.rename_('batch', 'length')
    else:
        si = get_named_range(max_inds.size('sample'),
                             'sample').align_as(max_inds)
        with NoName(max_inds, y_one_hot, bi, li, si):
            probs = y_one_hot[bi, li, max_inds, si]
        probs.rename_('batch', 'length', 'sample')
    # The 1e-8 offset keeps the log finite if a selected entry is exactly 0.
    seq_probs = (1e-8 + probs).log().sum(dim='length').exp()

    return y, y_one_hot, max_inds, seq_probs
예제 #2
0
    def search(self,
               sot_id: int,
               src_emb: FT,
               src_outputs: FT,
               src_paddings: BT,
               src_lengths: LT,
               beam_size: int,
               lang_emb: Optional[FT] = None) -> Hypotheses:
        """Build the initial beam for a batch and run the parent class' search.

        Args:
            sot_id: token id used to fill the initial token of every beam slot.
            src_emb: source embeddings; must carry a 'batch' dimension.
            src_outputs: source-side outputs, repeated per beam slot.
            src_paddings: source padding mask, repeated per beam slot.
            src_lengths: source lengths; 1.5x these (truncated to long) is
                handed to `BeamConstant` as `max_lengths`.
            beam_size: number of beam slots per batch element.
            lang_emb: optional value forwarded to `BeamConstant`.

        Returns:
            Whatever `super().search` returns for the initial beam.

        Raises:
            ValueError: if `beam_size` is not positive.
        """
        if beam_size <= 0:
            raise ValueError('`beam_size` must be positive.')

        batch_size = src_emb.size('batch')
        # Allocate directly on the target device instead of CPU-then-copy.
        tokens = torch.full([batch_size, beam_size],
                            sot_id,
                            dtype=torch.long,
                            device=src_emb.device).rename('batch', 'beam')
        # Request a float tensor explicitly: inheriting `tokens`' long dtype
        # would truncate the sentinel to -9999 before any float conversion.
        accum_scores = torch.full_like(tokens, -9999.9, dtype=torch.float)
        # Only the first slot of each batch element starts with score 0; the
        # rest keep the large negative sentinel.
        accum_scores[:, 0] = 0.0
        init_att = None
        if g.input_feeding:
            init_att = get_zeros(batch_size, beam_size,
                                 g.hidden_size).rename('batch', 'beam',
                                                       'hidden')
        lstm_state = LstmStatesByLayers.zero_state(
            self.cell.num_layers,
            batch_size,
            beam_size,
            self.attn.input_tgt_size,
            bidirectional=False,
            names=['batch', 'beam', 'hidden'])

        def expand_beam(orig, collapse: bool = True):
            """Replicate `orig` `beam_size` times along 'batch'.

            With `collapse` the copies are interleaved into the 'batch'
            dimension itself; otherwise `duplicate` is asked to place them
            in a separate 'beam' dimension.
            """
            if collapse:
                return torch.repeat_interleave(orig, beam_size, dim='batch')
            else:
                return duplicate(orig, 'batch', beam_size, 'beam')

        src_emb = expand_beam(src_emb)
        src_outputs = expand_beam(src_outputs)
        src_paddings = expand_beam(src_paddings)
        max_lengths = (src_lengths.float() * 1.5).long()
        max_lengths = expand_beam(max_lengths, collapse=False)
        constants = BeamConstant(src_emb,
                                 src_outputs,
                                 src_paddings,
                                 max_lengths,
                                 lang_emb=lang_emb)
        init_beam = Beam(0,
                         accum_scores,
                         tokens,
                         lstm_state,
                         constants,
                         prev_att=init_att)
        hyps = super().search(init_beam)
        return hyps
예제 #3
0
 def extend(self, label_log_probs: FT):
     """Extend every hypothesis by one label and keep the top `g.beam_size`.

     The latest accumulated scores in `self.hyp_log_probs` are added to
     `label_log_probs`, the ('beam', 'label') pairs are flattened into one
     dimension, and the best `g.beam_size` entries per batch element are
     kept. The originating beam slot, the chosen label and the new score are
     appended to `self.beam_ids`, `self.hyps` and `self.hyp_log_probs`.

     Args:
         label_log_probs: named tensor with a 'label' dimension that can be
             aligned to ('batch', 'beam', 'label').
     """
     label_count = label_log_probs.size('label')
     step_scores = label_log_probs.align_to('batch', 'beam', 'label')
     prev_scores = self.hyp_log_probs[-1].align_to('batch', 'beam', 'label')
     flat_scores = (prev_scores + step_scores).flatten(['beam', 'label'],
                                                       'beam_X_label')
     best_scores, best_flat_inds = torch.topk(flat_scores, g.beam_size,
                                              'beam_X_label')
     # A flat index encodes (beam, label) as beam * label_count + label.
     updates = (
         (self.beam_ids, best_flat_inds // label_count),
         (self.hyps, best_flat_inds % label_count),
         (self.hyp_log_probs, best_scores),
     )
     for history, entry in updates:
         history.append(entry.rename(beam_X_label='beam'))
예제 #4
0
 def search_by_probs(self, lengths: LT,
                     label_log_probs: FT) -> Tuple[LT, FT]:
     """Run a beam search over per-position label log-probabilities.

     Args:
         lengths: length of every sequence in the batch.
         label_log_probs: named tensor that can be aligned to
             ('length', 'batch', 'label').

     Returns:
         `(samples, sample_log_probs)` taken from the finished beam, with the
         'beam' dimension renamed to 'sample'.
     """
     num_steps = lengths.max().item()
     batch_size = label_log_probs.size('batch')
     log_probs_by_step = label_log_probs.align_to('length', 'batch', 'label')
     beam = Beam(batch_size)
     for step in range(num_steps):
         step_log_probs = log_probs_by_step[step]
         # Positions at or past a sequence's length are multiplied by 0, so
         # every label there contributes a log-probability of 0.
         active = (step < lengths).align_as(step_log_probs).float()
         beam.extend(step_log_probs * active)
     beam.finish_search(lengths)
     return (beam.samples.rename(beam='sample'),
             beam.sample_log_probs.rename(beam='sample'))
예제 #5
0
 def search_by_probs(self, lengths: LT,
                     label_log_probs: FT) -> Tuple[LT, FT]:
     """Score every possible label sequence exhaustively.

     All sequences of length `lengths.max()` over the labels `B`, `I`, `O`
     are enumerated (3 ** max_length of them) and scored by summing the
     gathered per-position log-probabilities over the valid positions.

     Args:
         lengths: length of every sequence in the batch.
         label_log_probs: named tensor with 'batch' and 'label' dimensions
             from which per-position label scores are gathered.

     Returns:
         `(samples, sample_log_probs)`: the enumerated label sequences,
         expanded to ('batch', 'sample', 'length'), and their summed
         log-probabilities with 'length' reduced away.
     """
     max_length = lengths.max().item()
     samples = get_tensor(
         torch.LongTensor(list(product([B, I, O], repeat=max_length))))
     samples.rename_('sample', 'length')
     bs = label_log_probs.size('batch')
     samples = samples.align_to('batch', 'sample',
                                'length').expand(bs, -1, -1)
     sample_log_probs = label_log_probs.gather('label', samples)
     with NoName(lengths):
         length_mask = get_length_mask(lengths, max_length).rename(
             'batch', 'length')
     # `align_as` is the variant that takes a tensor; `align_to` expects
     # dimension names and cannot be handed `sample_log_probs` directly.
     length_mask = length_mask.align_as(sample_log_probs)
     # Zero out positions past each sequence's length before summing.
     sample_log_probs = (sample_log_probs *
                         length_mask.float()).sum(dim='length')
     return samples, sample_log_probs
예제 #6
0
 def _get_Wh_s(self, h_s: FT) -> FT:
     """Multiply every vector of `h_s` by `self.Wa`.

     `h_s` must be 3-dimensional. Its first two dimensions are merged for a
     single matrix product and then restored. No names are assigned to the
     result in this method.
     """
     # The three-way unpack also enforces that `h_s` has exactly three dims.
     seq_len, batch_size, _ = h_s.size()
     with NoName(h_s):
         flat = h_s.reshape(seq_len * batch_size, -1)
         projected = flat.mm(self.Wa)
         Wh_s = projected.view(seq_len, batch_size, -1)
     return Wh_s
예제 #7
0
    def _get_matches(self, extracted_word_repr: FT, unit_repr: FT,
                     viable_lens: LT, extracted_unit_ids: LT,
                     char_log_probs: FT) -> Matches:
        """Score every viable span against every vocabulary entry with a banded edit-distance DP.

        Substitution costs are the negated mix of a contextual log-probability
        (from `extracted_word_repr @ unit_repr.t()`) and a global one (rows of
        `char_log_probs` selected by `extracted_unit_ids`), weighted by
        `g.context_weight`. Insertions and deletions cost `self.ins_del_cost`.

        Args:
            extracted_word_repr: named tensor with 'viable' and 'len_w'
                dimensions (both are queried via `size`).
            unit_repr: unit representations; its transpose is multiplied
                against `extracted_word_repr`.
            viable_lens: used as the source-length index into the DP table.
            extracted_unit_ids: indices into `char_log_probs`.
            char_log_probs: global log-probabilities, indexed by
                `extracted_unit_ids`.

        Returns:
            `Matches(-nll, f)` where `f` is the full DP table named
            ('len_w_src', 'len_w_tgt', 'viable', 'vocab') and `nll` is the
            entry of `f` picked at (`viable_lens`, `self.vocab_length`) for
            every (viable, vocab) pair.
        """
        ns = extracted_word_repr.size('viable')
        len_w = extracted_word_repr.size('len_w')
        # NOTE(review): `nt` is later used interchangeably with
        # `len(self.vocab)` and `len(self.vocab_length)` — presumably all
        # three are the vocabulary size; confirm.
        nt = len(self.vocab_feat_matrix)
        # `msl` is the same value as `len_w` above (max source length).
        msl = extracted_word_repr.size('len_w')
        mtl = self.vocab_feat_matrix.size('length')

        # Score every position of every viable span against all units with a
        # single matrix product, then normalize with a log-softmax over 'unit'.
        ctx_logits = extracted_word_repr @ unit_repr.t()
        ctx_log_probs = ctx_logits.log_softmax(dim='unit').flatten(
            ['viable', 'len_w'], 'viable_X_len_w')
        with NoName(char_log_probs, extracted_unit_ids):
            global_log_probs = char_log_probs[extracted_unit_ids].rename(
                'viable_X_len_w', 'unit')
        # Interpolate contextual and global log-probabilities.
        weighted_log_probs = g.context_weight * ctx_log_probs + (
            1.0 - g.context_weight) * global_log_probs
        costs = -weighted_log_probs

        # Name: viable x len_w x unit
        costs = costs.unflatten('viable_X_len_w', [('viable', ns),
                                                   ('len_w', len_w)])

        # NOTE(j_luo) Use dictionary to save every state.
        # Keys are (source length, target length). The borders are
        # initialized to pure insertion/deletion cost.
        fs = dict()
        for i in range(msl + 1):
            fs[(i, 0)] = get_zeros(ns, nt).fill_(i * self.ins_del_cost)
        for j in range(mtl + 1):
            fs[(0, j)] = get_zeros(ns, nt).fill_(j * self.ins_del_cost)

        # ------------------------ Main body: DP ----------------------- #

        # Transition.
        with NoName(self.indexed_segments, costs):
            for ls in range(1, msl + 1):
                # Only a band around the diagonal is filled: target lengths
                # from max(ls - 2, 1) up to min(ls + 1, mtl) inclusive.
                min_lt = max(ls - 2, 1)
                max_lt = min(ls + 2, mtl + 1)
                for lt in range(min_lt, max_lt):
                    transitions = list()
                    # Step from (ls - 1, lt): one more source position,
                    # charged as an insertion/deletion.
                    if (ls - 1, lt) in fs:
                        transitions.append(fs[(ls - 1, lt)] +
                                           self.ins_del_cost)
                    # Step from (ls, lt - 1): one more target position,
                    # charged as an insertion/deletion.
                    if (ls, lt - 1) in fs:
                        transitions.append(fs[(ls, lt - 1)] +
                                           self.ins_del_cost)
                    # Diagonal step: substitute source position ls - 1 with
                    # whatever each vocab entry has at target position lt - 1.
                    if (ls - 1, lt - 1) in fs:
                        vocab_inds = self.indexed_segments[:, lt - 1]
                        sub_cost = costs[:, ls - 1, vocab_inds]
                        transitions.append(fs[(ls - 1, lt - 1)] + sub_cost)
                    if transitions:
                        # Keep the cheapest way of reaching (ls, lt).
                        all_s = torch.stack(transitions, dim=-1)
                        new_s, _ = all_s.min(dim=-1)
                        fs[(ls, lt)] = new_s

        # Assemble the dense table in row-major (source, target) order; cells
        # outside the band were never computed and get a large sentinel cost.
        f_lst = list()
        for i in range(msl + 1):
            for j in range(mtl + 1):
                if (i, j) not in fs:
                    fs[(i, j)] = get_zeros(ns, nt).fill_(9999.9)
                f_lst.append(fs[(i, j)])
        f = torch.stack(f_lst, dim=0).view(msl + 1, mtl + 1, -1,
                                           len(self.vocab))
        f.rename_('len_w_src', 'len_w_tgt', 'viable', 'vocab')

        # Get the values wanted.
        # For every (viable, vocab) pair, read the cell at that span's source
        # length and that vocab entry's target length.
        with NoName(f, viable_lens, self.vocab_length):
            idx_src = viable_lens.unsqueeze(dim=-1)
            idx_tgt = self.vocab_length
            # NOTE(review): `get_range(size, 2, k)` presumably yields an
            # index range shaped to broadcast along dim k of a 2-D index —
            # confirm against its definition.
            viable_i = get_range(ns, 2, 0)
            vocab_i = get_range(len(self.vocab_length), 2, 1)
            nll = f[idx_src, idx_tgt, viable_i, vocab_i]
            nll.rename_('viable', 'vocab')

        # Get the best spans.
        # The cost is negated so that it is handed over as a log-likelihood-like
        # score; the full table is passed along as well.
        matches = Matches(-nll, f)
        return matches