def score_candidates(self, batch, cand_vecs, cand_encs=None):
    """
    Score the candidates against the context of the batch.

    Memories are padded and passed to the model only when memories are
    enabled in the options and at least one per-example memory list in the
    batch is non-empty; otherwise the model receives ``mems=None``.

    When ``cand_encs`` is supplied the candidates are not handed to the model
    at all (``cands=None``) and the supplied encodings are scored in place of
    the model's candidate output.

    :param batch:
        batch object providing ``text_vec`` and ``memory_vecs``
    :param cand_vecs:
        vectorized candidates; ignored when ``cand_encs`` is given
    :param cand_encs:
        optional pre-computed candidate encodings

    :return:
        the result of ``self._score`` on the context and candidate encodings
    """
    memories = None
    if self.opt['use_memories'] and batch.memory_vecs is not None:
        # Only pad when at least one example actually carries a memory.
        if sum(len(mem) for mem in batch.memory_vecs):
            memories = padded_3d(
                batch.memory_vecs, use_cuda=self.use_cuda, pad_idx=self.NULL_IDX
            )

    have_cached_encs = cand_encs is not None
    # Pre-encoded candidates must not be re-encoded by the model.
    model_cands = None if have_cached_encs else cand_vecs

    context_h, cands_h = self.model(
        xs=batch.text_vec, mems=memories, cands=model_cands
    )
    if have_cached_encs:
        cands_h = cand_encs

    return self._score(context_h, cands_h)
def set_vocab_candidates(self, shared):
    """
    Set up the dictionary tokens as a global candidate set.

    Always sets ``self.opt['encode_candidate_vecs']`` to True. Afterwards:

    - ``self.vocab_candidates`` is a [num_cands] list of token strings
    - ``self.vocab_candidate_vecs`` is the padded tensor of vectorized tokens
    - ``self.vocab_candidate_encs`` holds the encodings of those vectors

    The three attributes are copied from ``shared`` when it is truthy, and
    are all None when neither ``opt['candidates']`` nor
    ``opt['eval_candidates']`` is 'vocab'. Otherwise the encodings are loaded
    from ``{model_file}.vocab.encs`` if that path exists, or else computed in
    batches of up to 512 and saved there.

    :param shared:
        shared-state dict holding the three attributes, or a falsy value
    """
    self.opt['encode_candidate_vecs'] = True

    if shared:
        self.vocab_candidates = shared['vocab_candidates']
        self.vocab_candidate_vecs = shared['vocab_candidate_vecs']
        self.vocab_candidate_encs = shared['vocab_candidate_encs']
        return

    if 'vocab' not in (self.opt['candidates'], self.opt['eval_candidates']):
        self.vocab_candidates = None
        self.vocab_candidate_vecs = None
        self.vocab_candidate_encs = None
        return

    tokens = []
    token_vecs = []
    # Index 0 is skipped (presumably the null/padding token -- confirm
    # against the dictionary implementation).
    for idx in range(1, len(self.dict)):
        token = self.dict[idx]
        tokens.append(token)
        token_vecs.append(
            self._vectorize_text(
                token, add_start=True, add_end=True, truncate=self.label_truncate
            )
        )
    self.vocab_candidates = tokens
    self.vocab_candidate_vecs = padded_3d([token_vecs]).squeeze(0)
    print(
        "[ Loaded fixed candidate set (n = {}) from vocabulary ]".format(
            len(self.vocab_candidates)
        )
    )

    enc_path = self.opt.get('model_file') + '.vocab.encs'
    if PathManager.exists(enc_path):
        # Reuse the encodings cached next to the model file.
        self.vocab_candidate_encs = self.load_candidates(
            enc_path, cand_type='vocab encodings'
        )
    else:
        batch_size = 512
        vec_batches = [
            self.vocab_candidate_vecs[start:start + batch_size]
            for start in range(0, len(self.vocab_candidate_vecs), batch_size)
        ]
        print(
            "[ Vectorizing vocab candidates ({} batch(es) of up to 512) ]".format(
                len(vec_batches)
            )
        )
        encoded = [
            self.encode_candidates(vec_batch) for vec_batch in tqdm(vec_batches)
        ]
        self.vocab_candidate_encs = torch.cat(encoded, 0)
        self.save_candidates(
            self.vocab_candidate_encs, enc_path, cand_type='vocab encodings'
        )

    if self.use_cuda:
        self.vocab_candidate_vecs = self.vocab_candidate_vecs.cuda()
        self.vocab_candidate_encs = self.vocab_candidate_encs.cuda()
def _make_candidate_vecs(self, cands):
    """
    Vectorize the fixed candidates and pad them into a single tensor.

    The candidates are vectorized in chunks of up to 512 via
    ``self.vectorize_fixed_candidates``; all resulting vectors are then padded
    together with ``self.NULL_IDX`` using the dtype of the first vector.

    :param cands:
        sliceable sequence of candidates; must be non-empty, since the dtype
        is read from the first vectorized candidate

    :return:
        the padded candidate tensor with the leading singleton dim squeezed
    """
    chunk_size = 512
    chunks = [
        cands[start:start + chunk_size]
        for start in range(0, len(cands), chunk_size)
    ]

    vectorized = []
    for chunk in chunks:
        vectorized += self.vectorize_fixed_candidates(chunk)

    padded = padded_3d(
        [vectorized], pad_idx=self.NULL_IDX, dtype=vectorized[0].dtype
    )
    return padded.squeeze(0)
def __text_batchify_multi_turn(model_agent, batch_samp):
    """
    Pad a sample of multi-turn contexts and derive its length/floor tensors.

    :param model_agent:
        agent supplying ``NULL_IDX``, ``use_cuda`` and ``_pad_tensor``
    :param batch_samp:
        per-example context vectors to pad with ``padded_3d``
        (presumably one list of utterance tensors per example -- confirm)

    :return:
        dict with keys 'text_vec' (padded contexts), 'text_lengths' (count of
        non-null tokens along the last dim), 'context_lens' (count of
        non-empty entries of 'text_lengths' along its last dim) and 'floors'
        (padded ``make_floor`` output for each context length)
    """
    null_idx = model_agent.NULL_IDX
    text_vec = padded_3d(batch_samp, null_idx, model_agent.use_cuda)

    # Tokens equal to the null index are padding and do not count.
    token_counts = (text_vec != null_idx).sum(dim=-1)
    # An entry with zero tokens is padding at the utterance level.
    context_lens = (token_counts != 0).sum(dim=-1)

    floor_seqs = [make_floor(n_utts.item()) for n_utts in context_lens]
    floors, _ = model_agent._pad_tensor(floor_seqs)

    return dict(
        text_vec=text_vec,
        text_lengths=token_counts,
        context_lens=context_lens,
        floors=floors,
    )
def _make_candidate_vecs(self, cands):
    """
    Build the cached, padded vector tensor for a fixed candidate list.

    Splits ``cands`` into batches of up to 512, announces the number of
    batches on stdout, vectorizes each batch with
    ``self.vectorize_fixed_candidates`` under a ``tqdm`` progress bar, and
    pads all vectors together with ``self.NULL_IDX``.

    :param cands:
        sliceable sequence of candidates; must be non-empty, since the dtype
        of the padded tensor is taken from the first vector

    :return:
        the padded candidate tensor with the leading singleton dim squeezed
    """
    step = 512
    cand_batches = []
    for offset in range(0, len(cands), step):
        cand_batches.append(cands[offset:offset + step])

    print(
        "[ Vectorizing fixed candidate set ({} batch(es) of up to 512) ]".format(
            len(cand_batches)
        )
    )

    vecs = []
    for cand_batch in tqdm(cand_batches):
        vecs.extend(self.vectorize_fixed_candidates(cand_batch))

    stacked = padded_3d([vecs], pad_idx=self.NULL_IDX, dtype=vecs[0].dtype)
    return stacked.squeeze(0)
def _set_batch_gold_doc_vec(self, valid_exs: List[Message], batch: Batch) -> Batch:
    """
    Attach padded gold document/title vectors and their counts to the batch.

    For each example whose 'gold_doc_vec' is not None, the document and title
    vectors are padded with ``self._pad_tensor`` and the number of documents
    is recorded. Every other example contributes ``self.EMPTY.unsqueeze(0)``
    as both document and title placeholder, with a count of 0.

    :param valid_exs:
        the examples that make up the batch
    :param batch:
        batch to receive ``gold_doc_vec``, ``gold_doc_title_vec`` and
        ``num_gold_docs``

    :return:
        the same batch object, updated in place
    """
    doc_vecs, title_vecs, doc_counts = [], [], []

    for example in valid_exs:
        gold_docs = example.get('gold_doc_vec')
        if gold_docs is None:
            # Placeholder row so the example still occupies a batch slot.
            doc_vecs.append(self.EMPTY.unsqueeze(0))
            title_vecs.append(self.EMPTY.unsqueeze(0))
            doc_counts.append(0)
            continue

        padded_docs, _ = self._pad_tensor(gold_docs)
        padded_titles, _ = self._pad_tensor(example['gold_doc_title_vec'])
        doc_vecs.append(padded_docs)
        title_vecs.append(padded_titles)
        doc_counts.append(len(gold_docs))

    batch.gold_doc_vec = padded_3d(doc_vecs)
    batch.gold_doc_title_vec = padded_3d(title_vecs)
    batch.num_gold_docs = torch.LongTensor(doc_counts)
    return batch
def _set_batch_memory_vec(self, valid_exs: List[Message], batch: Batch) -> Batch:
    """
    Attach padded memory vectors and per-example memory counts to the batch.

    Each example whose 'memory_vec' is not None is padded with
    ``self._pad_tensor`` and collected. Examples without memories add a count
    of 0 but no entry to the collected list, so ``batch.memory_vec`` only has
    rows for examples that carry memories, while ``batch.num_memories`` has
    one entry per example.

    :param valid_exs:
        the examples that make up the batch
    :param batch:
        batch to receive ``memory_vec`` and ``num_memories``

    :return:
        the same batch object, updated in place
    """
    padded_mems = []
    mem_counts = []

    for example in valid_exs:
        memory = example.get('memory_vec')
        if memory is None:
            mem_counts.append(0)
            continue
        padded, _ = self._pad_tensor(memory)
        padded_mems.append(padded)
        mem_counts.append(len(memory))

    batch.memory_vec = padded_3d(padded_mems)
    batch.num_memories = torch.LongTensor(mem_counts)
    return batch
def _set_batch_memory_decoder_vec(self, valid_exs: List[Message], batch: Batch) -> Batch:
    """
    Attach padded memory-decoder vectors and their counts to the batch.

    Examples whose 'memory_decoder_vec' is not None are padded with
    ``self._pad_tensor`` and collected; the others only add a count of 0.
    As a result ``batch.memory_decoder_vec`` has rows only for examples that
    provide the field, while ``batch.num_memory_decoder_vecs`` has one entry
    per example.

    :param valid_exs:
        the examples that make up the batch
    :param batch:
        batch to receive ``memory_decoder_vec`` and
        ``num_memory_decoder_vecs``

    :return:
        the same batch object, updated in place
    """
    padded_toks = []
    counts = []

    for example in valid_exs:
        dec_vecs = example.get('memory_decoder_vec')
        has_vecs = dec_vecs is not None
        if has_vecs:
            padded, _ = self._pad_tensor(dec_vecs)
            padded_toks.append(padded)
        counts.append(len(dec_vecs) if has_vecs else 0)

    batch.memory_decoder_vec = padded_3d(padded_toks)
    batch.num_memory_decoder_vecs = torch.LongTensor(counts)
    return batch
def _build_candidates(self, batch, source, mode):
    """
    Build a candidate set for this batch.

    :param batch:
        a Batch object (defined in torch_agent.py)
    :param source:
        the source from which candidates should be built, one of
        ['batch', 'batch-all-cands', 'inline', 'fixed', 'vocab']
    :param mode:
        'train' or 'eval'; only used in warning and error messages

    :return:
        tuple (cands, cand_vecs, label_inds), in that order

        cands: A [num_cands] list of (text) candidates
            OR a [batchsize] list of such lists if source=='inline'
        cand_vecs: A padded [num_cands, seqlen] LongTensor of vectorized
            candidates OR a [batchsize, num_cands, seqlen] LongTensor if
            source=='inline'
        label_inds: A [bsz] LongTensor of the indices of the labels for each
            example from its respective candidate set, or None when the batch
            has no label vectors, when source=='vocab', or when a label is
            missing from the candidates and bad candidates are ignored
            outside of training

    :raises ValueError:
        if the data required by ``source`` is missing (label vectors for
        'batch', candidate vectors for 'batch-all-cands' and 'inline', fixed
        candidates for 'fixed')
    :raises RuntimeError:
        for 'inline' and 'fixed', if a label is not among the candidates and
        the error is not being ignored
    :raises Exception:
        if ``source`` is not one of the recognized values

    Possible sources of candidates:

        * batch: the set of all labels in this batch
            Use all labels in the batch as the candidate set (with all but the
            example's label being treated as negatives).
            Note: with this setting, the candidate set is identical for all
            examples in a batch. This option may be undesirable if it is
            possible for duplicate labels to occur in a batch, since the
            second instance of the correct label will be treated as a
            negative.
        * batch-all-cands: the set of all candidates in this batch
            Use all candidates in the batch as candidate set.
            Note 1: This can result in a very large number of candidates.
            Note 2: In this case we will deduplicate candidates.
            Note 3: just like with 'batch' the candidate set is identical for
            all examples in a batch.
        * inline: batch_size lists, one list per example
            If each example comes with a list of possible candidates, use
            those.
            Note: With this setting, each example will have its own candidate
            set.
        * fixed: one global candidate list, provided in a file from the user
            If self.fixed_candidates is not None, use a set of fixed
            candidates for all examples.
            Note: this setting is not recommended for training unless the
            universe of possible candidates is very small.
        * vocab: one global candidate list, extracted from the vocabulary with
            the exception of self.NULL_IDX.
    """
    label_vecs = batch.label_vec  # 2-D tensor when present (asserted below)
    label_inds = None
    batchsize = (
        batch.text_vec.size(0)
        if batch.text_vec is not None
        else batch.image.size(0)
    )

    if label_vecs is not None:
        assert label_vecs.dim() == 2

    if source == 'batch':
        warn_once(
            '[ Executing {} mode with batch labels as set of candidates. ]'
            ''.format(mode)
        )
        if batchsize == 1:
            warn_once(
                "[ Warning: using candidate source 'batch' and observed a "
                "batch of size 1. This may be due to uneven batch sizes at "
                "the end of an epoch. ]"
            )
        if label_vecs is None:
            raise ValueError(
                "If using candidate source 'batch', then batch.label_vec cannot be "
                "None."
            )

        cands = batch.labels
        cand_vecs = label_vecs
        # Example i's own label sits at row i of the candidate matrix.
        label_inds = label_vecs.new_tensor(range(batchsize))

    elif source == 'batch-all-cands':
        warn_once(
            '[ Executing {} mode with all candidates provided in the batch ]'
            ''.format(mode)
        )
        if batch.candidate_vecs is None:
            raise ValueError(
                "If using candidate source 'batch-all-cands', then batch."
                "candidate_vecs cannot be None. If your task does not have "
                "inline candidates, consider using one of "
                "--{m}={{'batch','fixed','vocab'}}."
                "".format(m='candidates' if mode == 'train' else 'eval-candidates')
            )
        # collect every distinct candidate in the batch, in first-seen order
        cands = []
        all_cands_vecs = []
        # dictionary used for deduplication: candidate text -> row index
        cands_to_id = {}
        for i, cands_for_sample in enumerate(batch.candidates):
            for j, cand in enumerate(cands_for_sample):
                if cand not in cands_to_id:
                    cands.append(cand)
                    cands_to_id[cand] = len(cands_to_id)
                    all_cands_vecs.append(batch.candidate_vecs[i][j])
        cand_vecs, _ = self._pad_tensor(all_cands_vecs)
        # Without label vectors there is nothing to index; leave label_inds
        # as None, matching the 'inline' and 'fixed' branches.
        if label_vecs is not None:
            label_inds = label_vecs.new_tensor(
                [cands_to_id[label] for label in batch.labels]
            )

    elif source == 'inline':
        warn_once(
            '[ Executing {} mode with provided inline set of candidates ]'
            ''.format(mode)
        )
        if batch.candidate_vecs is None:
            raise ValueError(
                "If using candidate source 'inline', then batch.candidate_vecs "
                "cannot be None. If your task does not have inline candidates, "
                "consider using one of --{m}={{'batch','fixed','vocab'}}."
                "".format(m='candidates' if mode == 'train' else 'eval-candidates')
            )

        cands = batch.candidates
        cand_vecs = padded_3d(
            batch.candidate_vecs,
            self.NULL_IDX,
            use_cuda=self.use_cuda,
            fp16friendly=self.fp16,
        )
        if label_vecs is not None:
            label_inds = label_vecs.new_empty((batchsize))
            bad_batch = False
            for i, label_vec in enumerate(label_vecs):
                # Re-pad (or truncate) the label to the candidate width so it
                # can be compared against the candidate rows.
                label_vec_pad = label_vec.new_zeros(cand_vecs[i].size(1)).fill_(
                    self.NULL_IDX
                )
                if cand_vecs[i].size(1) < len(label_vec):
                    label_vec = label_vec[0 : cand_vecs[i].size(1)]
                label_vec_pad[0 : label_vec.size(0)] = label_vec
                label_inds[i] = self._find_match(cand_vecs[i], label_vec_pad)
                if label_inds[i] == -1:
                    bad_batch = True
            if bad_batch:
                if self.ignore_bad_candidates and not self.is_training:
                    label_inds = None
                else:
                    raise RuntimeError(
                        'At least one of your examples has a set of label candidates '
                        'that does not contain the label. To ignore this error '
                        'set `--ignore-bad-candidates True`.'
                    )

    elif source == 'fixed':
        if self.fixed_candidates is None:
            raise ValueError(
                "If using candidate source 'fixed', then you must provide the path "
                "to a file of candidates with the flag --fixed-candidates-path or "
                "the name of a task with --fixed-candidates-task."
            )
        warn_once(
            "[ Executing {} mode with a common set of fixed candidates "
            "(n = {}). ]".format(mode, len(self.fixed_candidates))
        )

        cands = self.fixed_candidates
        cand_vecs = self.fixed_candidate_vecs

        if label_vecs is not None:
            label_inds = label_vecs.new_empty((batchsize))
            bad_batch = False
            for batch_idx, label_vec in enumerate(label_vecs):
                # Re-pad (or truncate) the label to the candidate width so it
                # can be compared against the candidate rows.
                max_c_len = cand_vecs.size(1)
                label_vec_pad = label_vec.new_zeros(max_c_len).fill_(self.NULL_IDX)
                if max_c_len < len(label_vec):
                    label_vec = label_vec[0:max_c_len]
                label_vec_pad[0 : label_vec.size(0)] = label_vec
                label_inds[batch_idx] = self._find_match(cand_vecs, label_vec_pad)
                if label_inds[batch_idx] == -1:
                    bad_batch = True
            if bad_batch:
                if self.ignore_bad_candidates and not self.is_training:
                    label_inds = None
                else:
                    raise RuntimeError(
                        'At least one of your examples has a set of label candidates '
                        'that does not contain the label. To ignore this error '
                        'set `--ignore-bad-candidates True`.'
                    )

    elif source == 'vocab':
        warn_once(
            '[ Executing {} mode with tokens from vocabulary as candidates. ]'
            ''.format(mode)
        )
        cands = self.vocab_candidates
        cand_vecs = self.vocab_candidate_vecs
        # NOTE: label_inds is None here, as we will not find the label in
        # the set of vocab candidates
    else:
        raise Exception("Unrecognized source: %s" % source)

    return (cands, cand_vecs, label_inds)
def batchify(self, obs_batch, sort=False):
    """
    Create a batch of valid observations from an unchecked batch.

    A valid observation is one for which ``self.is_valid`` returns a truthy
    value. If there are no observations, or none of them is valid, an empty
    ``Batch()`` is returned.

    Returns a namedtuple Batch. See original definition above for in-depth
    explanation of each field.

    If you want to include additional fields in the batch, you can subclass
    this function and return your own "Batch" namedtuple: copy the Batch
    namedtuple at the top of this class, and then add whatever additional
    fields that you want to be able to access. You can then call
    super().batchify(...) to set up the original fields and then set up the
    additional fields in your subclass and return that batch instead.

    :param obs_batch:
        List of vectorized observations

    :param sort:
        Accepted for interface compatibility but ignored: this agent never
        reorders the examples of a batch, because the text vectors are padded
        per context rather than per utterance.

    :return:
        a Batch with text_vec, text_lengths, context_lens, floors, label_vec,
        label_lengths, labels, valid_indices, candidates, candidate_vecs,
        image and observations set; fields with no data in the observations
        are None
    """
    if len(obs_batch) == 0:
        return Batch()

    valid_obs = [(i, ex) for i, ex in enumerate(obs_batch) if self.is_valid(ex)]

    if len(valid_obs) == 0:
        return Batch()

    valid_inds, exs = zip(*valid_obs)

    # TEXT
    xs, x_lens, context_lens, floors = None, None, None, None
    if any('text_vec' in ex for ex in exs):
        _xs = [ex.get('text_vec', [self.EMPTY]) for ex in exs]
        xs = padded_3d(
            _xs,
            self.NULL_IDX,
            self.use_cuda,
            fp16friendly=self.opt.get('fp16'),
        )
        x_lens = (xs != self.NULL_IDX).sum(dim=-1)  # bsz, context_len
        context_lens = (x_lens != 0).sum(dim=-1)  # bsz
        floors, _ = padded_tensor(
            [make_floor(c_len.item()) for c_len in context_lens],
            use_cuda=self.use_cuda,
        )
        # NOTE: xs has shape [bsz, context_len, utt_len] in this agent, so
        # the batch is deliberately not sorted by text length.

    # LABELS
    labels_avail = any('labels_vec' in ex for ex in exs)
    some_labels_avail = labels_avail or any('eval_labels_vec' in ex for ex in exs)

    ys, y_lens, labels = None, None, None
    if some_labels_avail:
        field = 'labels' if labels_avail else 'eval_labels'

        label_vecs = [ex.get(field + '_vec', self.EMPTY) for ex in exs]
        labels = [ex.get(field + '_choice') for ex in exs]
        # padded_tensor returns the label lengths alongside the padded labels.
        ys, y_lens = padded_tensor(
            label_vecs,
            self.NULL_IDX,
            self.use_cuda,
            fp16friendly=self.opt.get('fp16'),
        )
        y_lens = torch.LongTensor(y_lens)
        if self.use_cuda:
            y_lens = y_lens.cuda()
        # NOTE: examples are deliberately not sorted by label length either.

    # LABEL_CANDIDATES
    cands, cand_vecs = None, None
    if any('label_candidates_vecs' in ex for ex in exs):
        cands = [ex.get('label_candidates', None) for ex in exs]
        cand_vecs = [ex.get('label_candidates_vecs', None) for ex in exs]

    # IMAGE
    imgs = None
    if any('image' in ex for ex in exs):
        imgs = [ex.get('image', None) for ex in exs]

    return Batch(
        text_vec=xs,
        text_lengths=x_lens,
        context_lens=context_lens,
        floors=floors,
        label_vec=ys,
        label_lengths=y_lens,
        labels=labels,
        valid_indices=valid_inds,
        candidates=cands,
        candidate_vecs=cand_vecs,
        image=imgs,
        observations=exs,
    )