def gumbel_softmax(logits: FT, temperature: float, num_samples: Optional[int] = None) -> Tuple[FT, FT, LT, FT]:
    """Sample from the Gumbel-Softmax distribution and optionally discretize.

    Args:
        logits: Scores with named dimensions 'batch', 'length' and 'label'.
        temperature: Forwarded unchanged to `gumbel_softmax_sample`.
        num_samples: Forwarded unchanged to `gumbel_softmax_sample`. When it is not
            None, the sampled tensor is expected to carry a 'sample' dimension
            (its size is read from `max_inds` below).

    Returns:
        A 4-tuple `(y, y_one_hot, max_inds, seq_probs)`:
            y: the soft samples, with 'batch', 'length', 'label' leading.
            y_one_hot: the discretized samples; numerically one-hot, but gradients
                flow through `y` (straight-through trick).
            max_inds: indices of the maximum along 'label'.
            seq_probs: per-sequence value obtained by summing `log(1e-8 + probs)`
                over 'length' and exponentiating.
    """
    logits = logits.align_to('batch', 'length', 'label')
    y = gumbel_softmax_sample(logits, temperature, num_samples)
    # Any extra dimension produced by sampling is kept after the three known ones.
    y = y.align_to('batch', 'length', 'label', ...)

    max_values, max_inds = y.max(dim='label')
    y_one_hot = (max_values.align_as(y) == y).float()
    # Straight-through: the forward value is the one-hot tensor, the backward pass only sees `y`.
    y_one_hot = (y_one_hot - y).detach() + y

    # Index grids used to pick, for every position, the entry at the arg-max label.
    bi = get_named_range(logits.size('batch'), 'batch').align_as(max_inds)
    li = get_named_range(logits.size('length'), 'length').align_as(max_inds)
    if num_samples is None:
        with NoName(max_inds, y_one_hot, bi, li):
            probs = y_one_hot[bi, li, max_inds]
        probs.rename_('batch', 'length')
    else:
        si = get_named_range(max_inds.size('sample'), 'sample').align_as(max_inds)
        with NoName(max_inds, y_one_hot, bi, li, si):
            probs = y_one_hot[bi, li, max_inds, si]
        probs.rename_('batch', 'length', 'sample')

    # The small constant keeps the logarithm finite when an entry is exactly zero.
    seq_probs = (1e-8 + probs).log().sum(dim='length').exp()
    return y, y_one_hot, max_inds, seq_probs
def search(self,
           sot_id: int,
           src_emb: FT,
           src_outputs: FT,
           src_paddings: BT,
           src_lengths: LT,
           beam_size: int,
           lang_emb: Optional[FT] = None) -> Hypotheses:
    """Build the initial beam and delegate the actual search to the parent class.

    Args:
        sot_id: Token id every hypothesis starts with.
        src_emb: Source embeddings; must have a 'batch' dimension.
        src_outputs: Source-side outputs, expanded along 'batch' like `src_emb`.
        src_paddings: Source padding mask, expanded along 'batch' like `src_emb`.
        src_lengths: Source lengths; each hypothesis is capped at 1.5x this value
            (truncated to an integer).
        beam_size: Number of hypotheses per batch element.
        lang_emb: Optional tensor passed through to `BeamConstant` untouched.

    Returns:
        Whatever `super().search` returns for the initial beam.

    Raises:
        ValueError: If `beam_size` is not positive.
    """
    if beam_size <= 0:
        raise ValueError('`beam_size` must be positive.')

    batch_size = src_emb.size('batch')
    tokens = torch.full([batch_size, beam_size], sot_id,
                        dtype=torch.long).to(src_emb.device).rename('batch', 'beam')
    # Create the scores as floats right away: filling through the integer dtype of
    # `tokens` and casting afterwards would drop the fractional part of the sentinel.
    accum_scores = torch.full_like(tokens, -9999.9, dtype=torch.float)
    # Only the first beam slot starts with a neutral score; the rest keep the large negative sentinel.
    accum_scores[:, 0] = 0.0

    init_att = None
    if g.input_feeding:
        init_att = get_zeros(batch_size, beam_size, g.hidden_size).rename('batch', 'beam', 'hidden')
    lstm_state = LstmStatesByLayers.zero_state(
        self.cell.num_layers,
        batch_size,
        beam_size,
        self.attn.input_tgt_size,
        bidirectional=False,
        names=['batch', 'beam', 'hidden'])

    def expand_beam(orig, collapse: bool = True):
        """Replicate `orig` `beam_size` times, either folded into 'batch' or as a new 'beam' dimension."""
        if collapse:
            return torch.repeat_interleave(orig, beam_size, dim='batch')
        else:
            return duplicate(orig, 'batch', beam_size, 'beam')

    src_emb = expand_beam(src_emb)
    src_outputs = expand_beam(src_outputs)
    src_paddings = expand_beam(src_paddings)
    max_lengths = (src_lengths.float() * 1.5).long()
    max_lengths = expand_beam(max_lengths, collapse=False)

    constants = BeamConstant(src_emb, src_outputs, src_paddings, max_lengths, lang_emb=lang_emb)
    init_beam = Beam(0, accum_scores, tokens, lstm_state, constants, prev_att=init_att)
    hyps = super().search(init_beam)
    return hyps
def extend(self, label_log_probs: FT):
    """Advance the beam by one step using the log-probabilities of the next label.

    The latest hypothesis scores are added to `label_log_probs`, the 'beam' and
    'label' dimensions are flattened together, and the top `g.beam_size` entries
    are kept. Their source beam indices, label ids and scores are appended to
    `self.beam_ids`, `self.hyps` and `self.hyp_log_probs` respectively.
    """
    label_count = label_log_probs.size('label')
    step_scores = label_log_probs.align_to('batch', 'beam', 'label')
    prev_scores = self.hyp_log_probs[-1].align_to('batch', 'beam', 'label')

    candidates = (prev_scores + step_scores).flatten(['beam', 'label'], 'beam_X_label')
    best_scores, flat_inds = torch.topk(candidates, g.beam_size, 'beam_X_label')

    # A flattened index encodes (beam, label) as beam * label_count + label.
    source_beams = flat_inds // label_count
    chosen_labels = flat_inds % label_count

    self.beam_ids.append(source_beams.rename(beam_X_label='beam'))
    self.hyps.append(chosen_labels.rename(beam_X_label='beam'))
    self.hyp_log_probs.append(best_scores.rename(beam_X_label='beam'))
def search_by_probs(self, lengths: LT, label_log_probs: FT) -> Tuple[LT, FT]:
    """Run a beam search over per-position label log-probabilities.

    Steps at or beyond a sequence's length have their log-probabilities
    multiplied by zero before being fed to the beam.

    Returns:
        The beam's samples and their log-probabilities, with the 'beam'
        dimension renamed to 'sample'.
    """
    num_steps = lengths.max().item()
    batch_size = label_log_probs.size('batch')
    # Put 'length' first so that one step can be sliced off with a plain index.
    step_major = label_log_probs.align_to('length', 'batch', 'label')

    beam = Beam(batch_size)
    for step in range(num_steps):
        step_log_probs = step_major[step]
        still_active = (step < lengths).align_as(step_log_probs)
        beam.extend(step_log_probs * still_active.float())
    beam.finish_search(lengths)

    return beam.samples.rename(beam='sample'), beam.sample_log_probs.rename(beam='sample')
def search_by_probs(self, lengths: LT, label_log_probs: FT) -> Tuple[LT, FT]:
    """Score every possible B/I/O label sequence exhaustively.

    All `3 ** max_length` sequences over `[B, I, O]` are enumerated, so this is
    only practical for short inputs.

    Args:
        lengths: Sequence lengths; the longest one sets the enumerated length.
        label_log_probs: Log-probabilities with 'batch' and 'label' dimensions
            (and presumably 'length' -- it is summed over at the end).

    Returns:
        `(samples, sample_log_probs)`: the enumerated sequences expanded over the
        batch, and for each one the sum of its label log-probabilities over the
        positions that fall within the sequence length.
    """
    max_length = lengths.max().item()
    samples = get_tensor(
        torch.LongTensor(list(product([B, I, O], repeat=max_length))))
    samples.rename_('sample', 'length')
    bs = label_log_probs.size('batch')
    samples = samples.align_to('batch', 'sample', 'length').expand(bs, -1, -1)

    # NOTE(review): `gather` is called with a dimension name and a named index here;
    # this presumably relies on a project-level extension -- confirm.
    sample_log_probs = label_log_probs.gather('label', samples)

    with NoName(lengths):
        length_mask = get_length_mask(lengths, max_length).rename(
            'batch', 'length')
    # `align_as` is the tensor-to-tensor alignment call; `align_to` takes dimension names.
    length_mask = length_mask.align_as(sample_log_probs)
    # Positions past the end of a sequence are zeroed out before summing.
    sample_log_probs = (sample_log_probs * length_mask.float()).sum(dim='length')
    return samples, sample_log_probs
def _get_Wh_s(self, h_s: FT) -> FT:
    """Multiply every state in the 3-D tensor `h_s` by `self.Wa`.

    The two leading dimensions are merged for a single matrix product and then
    restored; the result is returned without dimension names being reattached here.
    """
    src_len, batch_size, _ = h_s.size()
    with NoName(h_s):
        flat_states = h_s.reshape(src_len * batch_size, -1)
        projected = flat_states.mm(self.Wa)
        return projected.view(src_len, batch_size, -1)
def _get_matches(self, extracted_word_repr: FT, unit_repr: FT, viable_lens: LT, extracted_unit_ids: LT, char_log_probs: FT) -> Matches:
    """Run an edit-distance-style dynamic program between viable spans and vocabulary entries.

    Substitution costs are the negated mixture of two log-probabilities: a
    contextual one (log-softmax over 'unit' of `extracted_word_repr @ unit_repr.t()`)
    and a global one (`char_log_probs` indexed by `extracted_unit_ids`), mixed
    with `g.context_weight`. Insertions and deletions cost `self.ins_del_cost`.

    Only a band of target lengths around each source length is filled in; every
    cell outside that band (and not on the first row/column) is set to 9999.9.

    Returns:
        `Matches(-nll, f)`, where `f` is the full DP table named
        ('len_w_src', 'len_w_tgt', 'viable', 'vocab') and `nll` holds, for each
        (viable, vocab) pair, the table entry at `viable_lens` / `self.vocab_length`.
    """
    ns = extracted_word_repr.size('viable')
    len_w = extracted_word_repr.size('len_w')
    nt = len(self.vocab_feat_matrix)
    # NOTE: `msl` is the same value as `len_w` above (both read the 'len_w' size).
    msl = extracted_word_repr.size('len_w')
    mtl = self.vocab_feat_matrix.size('length')

    # Compute similarity logits all at once: for each viable span, compare it against all units.
    # NOTE(review): this is a plain matrix product; it is a cosine similarity only if both
    # inputs are already normalized -- confirm against the caller.
    ctx_logits = extracted_word_repr @ unit_repr.t()
    ctx_log_probs = ctx_logits.log_softmax(dim='unit').flatten(
        ['viable', 'len_w'], 'viable_X_len_w')
    with NoName(char_log_probs, extracted_unit_ids):
        global_log_probs = char_log_probs[extracted_unit_ids].rename(
            'viable_X_len_w', 'unit')
    weighted_log_probs = g.context_weight * ctx_log_probs + (
        1.0 - g.context_weight) * global_log_probs
    costs = -weighted_log_probs

    # Name: viable x len_w x unit
    costs = costs.unflatten('viable_X_len_w', [('viable', ns), ('len_w', len_w)])

    # NOTE(j_luo) Use dictionary to save every state.
    # Keys are (source length, target length); values are (ns, nt) cost tensors.
    fs = dict()
    # Boundary conditions: reaching (i, 0) or (0, j) costs i or j insertions/deletions.
    for i in range(msl + 1):
        fs[(i, 0)] = get_zeros(ns, nt).fill_(i * self.ins_del_cost)
    for j in range(mtl + 1):
        fs[(0, j)] = get_zeros(ns, nt).fill_(j * self.ins_del_cost)

    # ------------------------ Main body: DP ----------------------- #

    # Transition.
    with NoName(self.indexed_segments, costs):
        for ls in range(1, msl + 1):
            # Only target lengths in [ls - 2, ls + 1], clipped to [1, mtl], are visited.
            # NOTE(review): the band is asymmetric around `ls` -- confirm this is intended.
            min_lt = max(ls - 2, 1)
            max_lt = min(ls + 2, mtl + 1)
            for lt in range(min_lt, max_lt):
                transitions = list()
                # A predecessor is used only if it was filled in (boundary or inside the band).
                if (ls - 1, lt) in fs:
                    transitions.append(fs[(ls - 1, lt)] + self.ins_del_cost)
                if (ls, lt - 1) in fs:
                    transitions.append(fs[(ls, lt - 1)] + self.ins_del_cost)
                if (ls - 1, lt - 1) in fs:
                    # Substitution: cost of the unit found at target position lt - 1 of every
                    # vocabulary entry, taken at source position ls - 1.
                    vocab_inds = self.indexed_segments[:, lt - 1]
                    sub_cost = costs[:, ls - 1, vocab_inds]
                    transitions.append(fs[(ls - 1, lt - 1)] + sub_cost)
                if transitions:
                    # Keep the cheapest way of reaching (ls, lt).
                    all_s = torch.stack(transitions, dim=-1)
                    new_s, _ = all_s.min(dim=-1)
                    fs[(ls, lt)] = new_s

    # Assemble the dense table in row-major (source, target) order, padding unvisited cells.
    f_lst = list()
    for i in range(msl + 1):
        for j in range(mtl + 1):
            if (i, j) not in fs:
                fs[(i, j)] = get_zeros(ns, nt).fill_(9999.9)
            f_lst.append(fs[(i, j)])
    # NOTE(review): the last dimension is sized by `len(self.vocab)` while the cells were
    # allocated with `nt = len(self.vocab_feat_matrix)`; presumably these are equal -- confirm.
    f = torch.stack(f_lst, dim=0).view(msl + 1, mtl + 1, -1, len(self.vocab))
    f.rename_('len_w_src', 'len_w_tgt', 'viable', 'vocab')

    # Get the values wanted.
    with NoName(f, viable_lens, self.vocab_length):
        idx_src = viable_lens.unsqueeze(dim=-1)
        idx_tgt = self.vocab_length
        # presumably index ranges over the viable / vocab axes shaped for 2-D broadcasting
        # -- TODO confirm `get_range` semantics.
        viable_i = get_range(ns, 2, 0)
        vocab_i = get_range(len(self.vocab_length), 2, 1)
        nll = f[idx_src, idx_tgt, viable_i, vocab_i]
        nll.rename_('viable', 'vocab')

    # Get the best spans.
    matches = Matches(-nll, f)
    return matches