Python padded_3d示例，parlai.utils.torch.padded_3d Python示例

示例#1

0

显示文件

    def score_candidates(self, batch, cand_vecs, cand_encs=None):
        """
        Score candidates against the encoded batch context.

        :param batch:
            batch object; ``batch.text_vec`` is passed to the model as ``xs``
            and ``batch.memory_vecs`` is consulted when
            ``self.opt['use_memories']`` is set.
        :param cand_vecs:
            candidate vectors handed to the model for encoding. Ignored when
            ``cand_encs`` is given.
        :param cand_encs:
            optional pre-computed candidate encodings. When given, they are
            scored directly in place of the model's candidate output.

        :return:
            the result of ``self._score`` on the context encoding and the
            candidate encodings.
        """
        mems = None
        if self.opt['use_memories'] and batch.memory_vecs is not None:
            # Only pad when at least one example carries a non-empty memory
            # list; otherwise the model is called with mems=None.
            if sum(map(len, batch.memory_vecs)) > 0:
                mems = padded_3d(
                    batch.memory_vecs, use_cuda=self.use_cuda, pad_idx=self.NULL_IDX
                )

        precomputed = cand_encs is not None
        # Pre-encoded candidates must not be re-encoded by the model.
        model_cands = None if precomputed else cand_vecs
        context_h, cands_h = self.model(
            xs=batch.text_vec, mems=mems, cands=model_cands
        )

        return self._score(context_h, cand_encs if precomputed else cands_h)

示例#2

0

显示文件

文件： bi_encoder_ranker.py 项目： simplecoka/cortx

    def set_vocab_candidates(self, shared):
        """
        Use the dictionary tokens as the candidate set.

        Always sets ``self.opt['encode_candidate_vecs']`` to True, then fills
        three attributes:

        * ``self.vocab_candidates``: list of token strings
        * ``self.vocab_candidate_vecs``: padded tensor of the vectorized tokens
        * ``self.vocab_candidate_encs``: encodings of those vectors

        :param shared:
            if truthy, the three attributes are copied from this mapping
            instead of being rebuilt.

        When neither ``opt['candidates']`` nor ``opt['eval_candidates']`` is
        ``'vocab'``, all three attributes are set to None. Otherwise the
        encodings are loaded from ``<model_file>.vocab.encs`` if that path
        exists, or computed in slices of 512 and saved there.
        """
        self.opt['encode_candidate_vecs'] = True

        if shared:
            # Reuse what was already built and placed in the shared mapping.
            self.vocab_candidates = shared['vocab_candidates']
            self.vocab_candidate_vecs = shared['vocab_candidate_vecs']
            self.vocab_candidate_encs = shared['vocab_candidate_encs']
            return

        requested = (self.opt['candidates'], self.opt['eval_candidates'])
        if 'vocab' not in requested:
            self.vocab_candidates = None
            self.vocab_candidate_vecs = None
            self.vocab_candidate_encs = None
            return

        tokens = []
        token_vecs = []
        # Iteration starts at index 1, so dictionary entry 0 is never a candidate.
        for idx in range(1, len(self.dict)):
            token = self.dict[idx]
            tokens.append(token)
            token_vecs.append(
                self._vectorize_text(
                    token,
                    add_start=True,
                    add_end=True,
                    truncate=self.label_truncate,
                )
            )
        self.vocab_candidates = tokens
        self.vocab_candidate_vecs = padded_3d([token_vecs]).squeeze(0)
        print(
            "[ Loaded fixed candidate set (n = {}) from vocabulary ]".format(
                len(self.vocab_candidates)
            )
        )

        enc_path = self.opt.get('model_file') + '.vocab.encs'
        if PathManager.exists(enc_path):
            self.vocab_candidate_encs = self.load_candidates(
                enc_path, cand_type='vocab encodings'
            )
        else:
            total = len(self.vocab_candidate_vecs)
            vec_batches = [
                self.vocab_candidate_vecs[lo:lo + 512] for lo in range(0, total, 512)
            ]
            print(
                "[ Vectorizing vocab candidates ({} batch(es) of up "
                "to 512) ]".format(len(vec_batches))
            )
            encoded = [
                self.encode_candidates(chunk) for chunk in tqdm(vec_batches)
            ]
            self.vocab_candidate_encs = torch.cat(encoded, 0)
            self.save_candidates(
                self.vocab_candidate_encs, enc_path, cand_type='vocab encodings'
            )

        if self.use_cuda:
            self.vocab_candidate_vecs = self.vocab_candidate_vecs.cuda()
            self.vocab_candidate_encs = self.vocab_candidate_encs.cuda()

示例#3

0

显示文件

文件： ranker.py 项目： convobox/ParlAI

 def _make_candidate_vecs(self, cands):
     """
     Vectorize a fixed candidate list and pad it into a single tensor.

     Candidates are vectorized in slices of 512 via
     ``self.vectorize_fixed_candidates``. The resulting vectors are padded
     with ``self.NULL_IDX`` by ``padded_3d`` and the leading singleton
     dimension is squeezed away.

     :param cands:
         sliceable sequence of candidates.
     :return:
         the padded candidate tensor with its outer dimension removed.
     """
     chunk_size = 512
     vectors = []
     for start in range(0, len(cands), chunk_size):
         vectors += self.vectorize_fixed_candidates(cands[start:start + chunk_size])
     # The padded output takes its dtype from the first candidate vector.
     stacked = padded_3d([vectors], pad_idx=self.NULL_IDX, dtype=vectors[0].dtype)
     return stacked.squeeze(0)

示例#4

0

显示文件

def __text_batchify_multi_turn(model_agent, batch_samp):
    """
    Pad a batch of multi-turn samples and derive its length tensors.

    :param model_agent:
        agent providing ``NULL_IDX``, ``use_cuda`` and ``_pad_tensor``.
    :param batch_samp:
        nested sample vectors passed straight to ``padded_3d``.
    :return:
        dict with keys ``text_vec``, ``text_lengths``, ``context_lens`` and
        ``floors``.
    """
    null_idx = model_agent.NULL_IDX
    text_vec = padded_3d(batch_samp, null_idx, model_agent.use_cuda)
    # Number of non-padding entries along the last dimension.
    text_lengths = (text_vec != null_idx).sum(dim=-1)
    # Number of entries with a non-zero length.
    context_lens = (text_lengths != 0).sum(dim=-1)
    floor_seqs = [make_floor(n_turns.item()) for n_turns in context_lens]
    floors, _ = model_agent._pad_tensor(floor_seqs)
    return {
        'text_vec': text_vec,
        'text_lengths': text_lengths,
        'context_lens': context_lens,
        'floors': floors,
    }

示例#5

0

显示文件

文件： torch_ranker_agent.py 项目： christiancosgrove/cs767hw2

 def _make_candidate_vecs(self, cands):
     """
     Vectorize a fixed candidate list and pad it into a single tensor.

     The candidates are split into slices of 512, a progress message is
     printed, and each slice is vectorized with
     ``self.vectorize_fixed_candidates`` under a ``tqdm`` progress bar. The
     vectors are then padded with ``self.NULL_IDX`` by ``padded_3d``.

     :param cands:
         sliceable sequence of candidates.
     :return:
         the padded candidate tensor with its outer dimension removed.
     """
     slices = [cands[lo:lo + 512] for lo in range(0, len(cands), 512)]
     print(
         "[ Vectorizing fixed candidate set ({} batch(es) of up to 512) ]".format(
             len(slices)
         )
     )
     vectors = [
         vec
         for piece in tqdm(slices)
         for vec in self.vectorize_fixed_candidates(piece)
     ]
     # The padded output takes its dtype from the first candidate vector.
     stacked = padded_3d([vectors], pad_idx=self.NULL_IDX, dtype=vectors[0].dtype)
     return stacked.squeeze(0)

示例#6

0

显示文件

 def _set_batch_gold_doc_vec(self, valid_exs: List[Message], batch: Batch) -> Batch:
     """
     Attach padded gold document and title vectors to the batch.

     Every example contributes one row. Examples without a ``gold_doc_vec``
     contribute ``self.EMPTY.unsqueeze(0)`` as a placeholder and a document
     count of 0.

     :param valid_exs:
         examples to read ``gold_doc_vec`` / ``gold_doc_title_vec`` from.
     :param batch:
         batch to update in place.
     :return:
         the same batch, with ``gold_doc_vec``, ``gold_doc_title_vec`` and
         ``num_gold_docs`` set.
     """
     doc_rows, title_rows, doc_counts = [], [], []
     for example in valid_exs:
         gold_docs = example.get('gold_doc_vec')
         if gold_docs is None:
             # Placeholder row so every example keeps a slot in the batch.
             doc_rows.append(self.EMPTY.unsqueeze(0))
             title_rows.append(self.EMPTY.unsqueeze(0))
             doc_counts.append(0)
             continue
         padded_docs, _ = self._pad_tensor(gold_docs)
         padded_titles, _ = self._pad_tensor(example['gold_doc_title_vec'])
         doc_rows.append(padded_docs)
         title_rows.append(padded_titles)
         doc_counts.append(len(gold_docs))

     batch.gold_doc_vec = padded_3d(doc_rows)
     batch.gold_doc_title_vec = padded_3d(title_rows)
     batch.num_gold_docs = torch.LongTensor(doc_counts)
     return batch

示例#7

0

显示文件

 def _set_batch_memory_vec(self, valid_exs: List[Message], batch: Batch) -> Batch:
     """
     Attach padded memory vectors and per-example memory counts to the batch.

     Examples without a ``memory_vec`` add a count of 0 but no row to the
     padded tensor, so ``batch.memory_vec`` may have fewer rows than there
     are examples.

     :param valid_exs:
         examples to read ``memory_vec`` from.
     :param batch:
         batch to update in place.
     :return:
         the same batch, with ``memory_vec`` and ``num_memories`` set.
     """
     padded_mems = []
     mem_counts = []
     for example in valid_exs:
         memory = example.get('memory_vec')
         if memory is None:
             mem_counts.append(0)
             continue
         padded, _ = self._pad_tensor(memory)
         padded_mems.append(padded)
         mem_counts.append(len(memory))

     batch.memory_vec = padded_3d(padded_mems)
     batch.num_memories = torch.LongTensor(mem_counts)
     return batch

示例#8

0

显示文件

文件： blenderbot2.py 项目： skywalker023/ParlAI

 def _set_batch_memory_decoder_vec(self, valid_exs: List[Message],
                                   batch: Batch) -> Batch:
     """
     Attach padded memory-decoder vectors and their counts to the batch.

     Examples without a ``memory_decoder_vec`` add a count of 0 but no row
     to the padded tensor, so ``batch.memory_decoder_vec`` may have fewer
     rows than there are examples.

     :param valid_exs:
         examples to read ``memory_decoder_vec`` from.
     :param batch:
         batch to update in place.
     :return:
         the same batch, with ``memory_decoder_vec`` and
         ``num_memory_decoder_vecs`` set.
     """
     padded_rows = []
     row_counts = []
     for example in valid_exs:
         decoder_vecs = example.get('memory_decoder_vec')
         if decoder_vecs is None:
             row_counts.append(0)
             continue
         padded, _lens = self._pad_tensor(decoder_vecs)
         padded_rows.append(padded)
         row_counts.append(len(decoder_vecs))

     batch.memory_decoder_vec = padded_3d(padded_rows)
     batch.num_memory_decoder_vecs = torch.LongTensor(row_counts)
     return batch

示例#9

0

显示文件

    def _build_candidates(self, batch, source, mode):
        """
        Build a candidate set for this batch.

        :param batch:
            a Batch object (defined in torch_agent.py)
        :param source:
            the source from which candidates should be built, one of
            ['batch', 'batch-all-cands', 'inline', 'fixed']
        :param mode:
            'train' or 'eval'

        :return: tuple of tensors (label_inds, cands, cand_vecs)

            label_inds: A [bsz] LongTensor of the indices of the labels for each
                example from its respective candidate set
            cands: A [num_cands] list of (text) candidates
                OR a [batchsize] list of such lists if source=='inline'
            cand_vecs: A padded [num_cands, seqlen] LongTensor of vectorized candidates
                OR a [batchsize, num_cands, seqlen] LongTensor if source=='inline'

        Possible sources of candidates:

            * batch: the set of all labels in this batch
                Use all labels in the batch as the candidate set (with all but the
                example's label being treated as negatives).
                Note: with this setting, the candidate set is identical for all
                examples in a batch. This option may be undesirable if it is possible
                for duplicate labels to occur in a batch, since the second instance of
                the correct label will be treated as a negative.
            * batch-all-cands: the set of all candidates in this batch
                Use all candidates in the batch as candidate set.
                Note 1: This can result in a very large number of candidates.
                Note 2: In this case we will deduplicate candidates.
                Note 3: just like with 'batch' the candidate set is identical
                for all examples in a batch.
            * inline: batch_size lists, one list per example
                If each example comes with a list of possible candidates, use those.
                Note: With this setting, each example will have its own candidate set.
            * fixed: one global candidate list, provided in a file from the user
                If self.fixed_candidates is not None, use a set of fixed candidates for
                all examples.
                Note: this setting is not recommended for training unless the
                universe of possible candidates is very small.
            * vocab: one global candidate list, extracted from the vocabulary with the
                exception of self.NULL_IDX.
        """
        label_vecs = batch.label_vec  # [bsz] list of lists of LongTensors
        label_inds = None
        batchsize = (
            batch.text_vec.size(0)
            if batch.text_vec is not None
            else batch.image.size(0)
        )

        if label_vecs is not None:
            assert label_vecs.dim() == 2

        if source == 'batch':
            warn_once(
                '[ Executing {} mode with batch labels as set of candidates. ]'
                ''.format(mode)
            )
            if batchsize == 1:
                warn_once(
                    "[ Warning: using candidate source 'batch' and observed a "
                    "batch of size 1. This may be due to uneven batch sizes at "
                    "the end of an epoch. ]"
                )
            if label_vecs is None:
                raise ValueError(
                    "If using candidate source 'batch', then batch.label_vec cannot be "
                    "None."
                )

            cands = batch.labels
            cand_vecs = label_vecs
            label_inds = label_vecs.new_tensor(range(batchsize))

        elif source == 'batch-all-cands':
            warn_once(
                '[ Executing {} mode with all candidates provided in the batch ]'
                ''.format(mode)
            )
            if batch.candidate_vecs is None:
                raise ValueError(
                    "If using candidate source 'batch-all-cands', then batch."
                    "candidate_vecs cannot be None. If your task does not have "
                    "inline candidates, consider using one of "
                    "--{m}={{'batch','fixed','vocab'}}."
                    "".format(m='candidates' if mode == 'train' else 'eval-candidates')
                )
            # initialize the list of cands with the labels
            cands = []
            all_cands_vecs = []
            # dictionary used for deduplication
            cands_to_id = {}
            for i, cands_for_sample in enumerate(batch.candidates):
                for j, cand in enumerate(cands_for_sample):
                    if cand not in cands_to_id:
                        cands.append(cand)
                        cands_to_id[cand] = len(cands_to_id)
                        all_cands_vecs.append(batch.candidate_vecs[i][j])
            cand_vecs, _ = self._pad_tensor(all_cands_vecs)
            label_inds = label_vecs.new_tensor(
                [cands_to_id[label] for label in batch.labels]
            )

        elif source == 'inline':
            warn_once(
                '[ Executing {} mode with provided inline set of candidates ]'
                ''.format(mode)
            )
            if batch.candidate_vecs is None:
                raise ValueError(
                    "If using candidate source 'inline', then batch.candidate_vecs "
                    "cannot be None. If your task does not have inline candidates, "
                    "consider using one of --{m}={{'batch','fixed','vocab'}}."
                    "".format(m='candidates' if mode == 'train' else 'eval-candidates')
                )

            cands = batch.candidates
            cand_vecs = padded_3d(
                batch.candidate_vecs,
                self.NULL_IDX,
                use_cuda=self.use_cuda,
                fp16friendly=self.fp16,
            )
            if label_vecs is not None:
                label_inds = label_vecs.new_empty((batchsize))
                bad_batch = False
                for i, label_vec in enumerate(label_vecs):
                    label_vec_pad = label_vec.new_zeros(cand_vecs[i].size(1)).fill_(
                        self.NULL_IDX
                    )
                    if cand_vecs[i].size(1) < len(label_vec):
                        label_vec = label_vec[0 : cand_vecs[i].size(1)]
                    label_vec_pad[0 : label_vec.size(0)] = label_vec
                    label_inds[i] = self._find_match(cand_vecs[i], label_vec_pad)
                    if label_inds[i] == -1:
                        bad_batch = True
                if bad_batch:
                    if self.ignore_bad_candidates and not self.is_training:
                        label_inds = None
                    else:
                        raise RuntimeError(
                            'At least one of your examples has a set of label candidates '
                            'that does not contain the label. To ignore this error '
                            'set `--ignore-bad-candidates True`.'
                        )

        elif source == 'fixed':
            if self.fixed_candidates is None:
                raise ValueError(
                    "If using candidate source 'fixed', then you must provide the path "
                    "to a file of candidates with the flag --fixed-candidates-path or "
                    "the name of a task with --fixed-candidates-task."
                )
            warn_once(
                "[ Executing {} mode with a common set of fixed candidates "
                "(n = {}). ]".format(mode, len(self.fixed_candidates))
            )

            cands = self.fixed_candidates
            cand_vecs = self.fixed_candidate_vecs

            if label_vecs is not None:
                label_inds = label_vecs.new_empty((batchsize))
                bad_batch = False
                for batch_idx, label_vec in enumerate(label_vecs):
                    max_c_len = cand_vecs.size(1)
                    label_vec_pad = label_vec.new_zeros(max_c_len).fill_(self.NULL_IDX)
                    if max_c_len < len(label_vec):
                        label_vec = label_vec[0:max_c_len]
                    label_vec_pad[0 : label_vec.size(0)] = label_vec
                    label_inds[batch_idx] = self._find_match(cand_vecs, label_vec_pad)
                    if label_inds[batch_idx] == -1:
                        bad_batch = True
                if bad_batch:
                    if self.ignore_bad_candidates and not self.is_training:
                        label_inds = None
                    else:
                        raise RuntimeError(
                            'At least one of your examples has a set of label candidates '
                            'that does not contain the label. To ignore this error '
                            'set `--ignore-bad-candidates True`.'
                        )

        elif source == 'vocab':
            warn_once(
                '[ Executing {} mode with tokens from vocabulary as candidates. ]'
                ''.format(mode)
            )
            cands = self.vocab_candidates
            cand_vecs = self.vocab_candidate_vecs
            # NOTE: label_inds is None here, as we will not find the label in
            # the set of vocab candidates
        else:
            raise Exception("Unrecognized source: %s" % source)

        return (cands, cand_vecs, label_inds)

示例#10

0

显示文件

文件： dialog_wae.py 项目： zeta1999/ContrastiveLearning4Dialogue

    def batchify(self, obs_batch, sort=False):
        """
        Create a batch of valid observations from an unchecked batch.

        A valid observation is one for which ``self.is_valid(ex)`` is true.

        Returns a Batch. If you want to include additional fields in the
        batch, you can subclass this function and return your own "Batch":
        call super().batchify(...) to set up the original fields, then set up
        the additional fields in your subclass and return that batch instead.

        :param obs_batch:
            List of vectorized observations

        :param sort:
            Accepted for interface compatibility only. This agent never
            reorders the examples in a batch, so the value is ignored.

        :return:
            an empty ``Batch()`` if ``obs_batch`` is empty or contains no valid
            observation; otherwise a Batch with text, label, candidate and
            image fields filled in (fields with no data are None).
        """
        if len(obs_batch) == 0:
            return Batch()

        valid_obs = [(i, ex) for i, ex in enumerate(obs_batch)
                     if self.is_valid(ex)]

        if len(valid_obs) == 0:
            return Batch()

        valid_inds, exs = zip(*valid_obs)

        # TEXT
        xs, x_lens, context_lens, floors = None, None, None, None
        if any('text_vec' in ex for ex in exs):
            # Examples without a text_vec get a single empty entry.
            _xs = [ex.get('text_vec', [self.EMPTY]) for ex in exs]
            xs = padded_3d(
                _xs,
                self.NULL_IDX,
                self.use_cuda,
                fp16friendly=self.opt.get('fp16'),
            )
            x_lens = (xs != self.NULL_IDX).sum(dim=-1)  # bsz, context_len
            context_lens = (x_lens != 0).sum(dim=-1)  # bsz
            floors, _ = padded_tensor(
                [make_floor(c_len.item()) for c_len in context_lens],
                use_cuda=self.use_cuda)
            # xs has shape [bsz, context_len, utt_len] in this agent, so the
            # batch is deliberately not sorted by text length.

        # LABELS
        labels_avail = any('labels_vec' in ex for ex in exs)
        some_labels_avail = (labels_avail
                             or any('eval_labels_vec' in ex for ex in exs))

        ys, y_lens, labels = None, None, None
        if some_labels_avail:
            field = 'labels' if labels_avail else 'eval_labels'

            label_vecs = [ex.get(field + '_vec', self.EMPTY) for ex in exs]
            labels = [ex.get(field + '_choice') for ex in exs]

            # The second value returned by padded_tensor becomes y_lens.
            ys, y_lens = padded_tensor(label_vecs,
                                       self.NULL_IDX,
                                       self.use_cuda,
                                       fp16friendly=self.opt.get('fp16'))
            y_lens = torch.LongTensor(y_lens)
            if self.use_cuda:
                y_lens = y_lens.cuda()
            # Examples are deliberately not sorted by label length either.

        # LABEL_CANDIDATES
        cands, cand_vecs = None, None
        if any('label_candidates_vecs' in ex for ex in exs):
            cands = [ex.get('label_candidates', None) for ex in exs]
            cand_vecs = [ex.get('label_candidates_vecs', None) for ex in exs]

        # IMAGE
        imgs = None
        if any('image' in ex for ex in exs):
            imgs = [ex.get('image', None) for ex in exs]

        return Batch(text_vec=xs,
                     text_lengths=x_lens,
                     context_lens=context_lens,
                     floors=floors,
                     label_vec=ys,
                     label_lengths=y_lens,
                     labels=labels,
                     valid_indices=valid_inds,
                     candidates=cands,
                     candidate_vecs=cand_vecs,
                     image=imgs,
                     observations=exs)