Python PaddingUtils.pad_text示例，parlai.core.utils.PaddingUtils.pad_text Python示例

示例#1

0

显示文件

文件： seq2seq.py 项目： yoichimatsuyama/ParlAI

    def vectorize(self, observations):
        """Convert a list of observations into input & target tensors.

        Returns a 7-tuple ``(xs, ys, labels, valid_inds, cands, valid_cands,
        is_training)``. Every element is ``None`` when ``pad_text`` yields no
        inputs for the batch. ``cands`` / ``valid_cands`` are only built when
        the batch is not a training batch and ``self.rank`` is set.
        """
        # the batch counts as training once any observation carries labels
        is_training = any('labels' in obs for obs in observations)
        xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text(
            observations,
            self.dict,
            end_idx=self.END_IDX,
            null_idx=self.NULL_IDX,
            dq=True,
            eval_labels=True,
            truncate=self.truncate)
        if xs is None:
            return None, None, None, None, None, None, None
        xs = torch.LongTensor(xs)
        if ys is not None:
            ys = torch.LongTensor(ys)
        if self.use_cuda:
            # copy into the reusable buffers kept on self
            # (presumably GPU tensors allocated elsewhere -- TODO confirm)
            self.xs.resize_(xs.size())
            self.xs.copy_(xs)
            xs = Variable(self.xs)
            if ys is not None:
                self.ys.resize_(ys.size())
                self.ys.copy_(ys)
                ys = Variable(self.ys)
        else:
            xs = Variable(xs)
            if ys is not None:
                ys = Variable(ys)

        cands = None
        valid_cands = None
        if not is_training and self.rank:
            # set up candidates
            cands = []
            valid_cands = []
            for i, v in enumerate(valid_inds):
                if 'label_candidates' not in observations[v]:
                    continue
                curr_lcs = list(observations[v]['label_candidates'])
                # wrap each candidate string so pad_text can treat it like
                # an observation
                curr_cands = [{'text': c} for c in curr_lcs]
                cs, _, _, valid_c_inds, *_ = PaddingUtils.pad_text(
                    curr_cands,
                    self.dict,
                    null_idx=self.NULL_IDX,
                    dq=True,
                    truncate=self.truncate)
                if cs is None:
                    # pad_text signals "nothing usable" with None (the same
                    # signal checked for xs above); skip this example rather
                    # than crash on the None results below
                    continue
                # i is the position within the valid examples, v the index
                # into the original observations
                valid_cands.append(
                    (i, v, [curr_lcs[j] for j in valid_c_inds]))
                cs = torch.LongTensor(cs)
                if self.use_cuda:
                    cs = cs.cuda()
                cands.append(cs)

        return xs, ys, labels, valid_inds, cands, valid_cands, is_training

示例#2

0

显示文件

文件： ibm_seq2seq.py 项目： zcth428/ParlAI

    def vectorize(self, observations):
        """Convert a list of observations into input & target tensors.

        Returns ``(xs, ys, labels, valid_inds, is_training)`` for a batch that
        ``pad_text`` could process. ``ys`` stays ``None`` when ``pad_text``
        produced no targets.
        """
        # the batch counts as training once any observation carries labels
        is_training = any('labels' in obs for obs in observations)
        xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text(
            observations,
            self.dict,
            end_idx=None,
            null_idx=self.NULL_IDX,
            dq=True,
            eval_labels=True,
            truncate=self.truncate)
        if xs is None:
            # NOTE(review): seven values are returned here while the normal
            # return below has five -- confirm what callers unpack before
            # changing either side
            return None, None, None, None, None, None, None
        xs = torch.LongTensor(xs)
        if ys is not None:
            # ys can be None (the checks below already allow for it);
            # torch.LongTensor(None) would raise
            ys = torch.LongTensor(ys)
        if self.use_cuda:
            # copy into the reusable buffers kept on self
            # (presumably GPU tensors allocated elsewhere -- TODO confirm)
            self.xs.resize_(xs.size())
            self.xs.copy_(xs)
            xs = Variable(self.xs)
            if ys is not None:
                self.ys.resize_(ys.size())
                self.ys.copy_(ys)
                ys = Variable(self.ys)
        else:
            xs = Variable(xs)
            if ys is not None:
                ys = Variable(ys)

        return xs, ys, labels, valid_inds, is_training

示例#3

0

显示文件

    def vectorize(self, observations):
        """Turn a batch of observations into padded input and target tensors.

        Returns ``(xs, ys, labels, valid_inds, is_training)``; all five are
        ``None`` when ``pad_text`` reports no inputs. ``ys`` is ``None`` when
        ``pad_text`` produced no targets.
        """
        def _as_variable(index_lists):
            # lists of indices -> LongTensor (on the GPU if enabled) -> Variable
            tensor = torch.LongTensor(index_lists)
            if self.use_cuda:
                tensor = tensor.cuda()
            return Variable(tensor)

        # training mode as soon as one observation carries labels
        is_training = False
        for obs in observations:
            if 'labels' in obs:
                is_training = True
                break

        # pad the text and get back lists of indices parsed with self.dict
        padded = PaddingUtils.pad_text(
            observations,
            self.dict,
            end_idx=self.END_IDX,
            null_idx=self.NULL_IDX,
            dq=False,
            eval_labels=True)
        xs, ys, labels, valid_inds, _, _ = padded
        if xs is None:
            return None, None, None, None, None

        xs = _as_variable(xs)
        ys = None if ys is None else _as_variable(ys)

        return xs, ys, labels, valid_inds, is_training

示例#4

0

显示文件

文件： seq2seq.py 项目： ahiroto/ParlAI

    def vectorize(self, observations):
        """Convert a list of observations into input & target tensors.

        Returns ``(xs, ys, labels, valid_inds, cands, valid_cands)``; all six
        are ``None`` when ``pad_text`` reports no inputs. ``cands`` and
        ``valid_cands`` are only built when there are no targets and
        ``self.rank`` is set.
        """
        xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text(
            observations, self.dict, self.END_IDX, self.NULL_IDX, dq=True,
            eval_labels=False, truncate=self.truncate)
        if xs is None:
            return None, None, None, None, None, None
        if self.use_cuda:
            # copy to gpu; `non_blocking` replaces the old `async` keyword
            # argument, which is a reserved word (SyntaxError) on Python 3.7+
            self.xs.resize_(xs.size())
            self.xs.copy_(xs, non_blocking=True)
            xs = Variable(self.xs)
            if ys is not None:
                self.ys.resize_(ys.size())
                self.ys.copy_(ys, non_blocking=True)
                ys = Variable(self.ys)
        else:
            xs = Variable(xs)
            if ys is not None:
                ys = Variable(ys)

        # set up candidates
        cands = None
        valid_cands = None
        if ys is None and self.rank:
            # only do ranking when no targets available and ranking flag set
            parsed_cs = []
            valid_cands = []
            for i, v in enumerate(valid_inds):
                if 'label_candidates' in observations[v]:
                    # keep the original strings next to their parsed versions
                    cs = list(observations[v]['label_candidates'])
                    # bounded deques cap each parsed candidate at
                    # self.truncate entries
                    curr_dqs = [deque(maxlen=self.truncate) for _ in cs]
                    for dq, c in zip(curr_dqs, cs):
                        dq.extendleft(reversed(self.parse(c)))
                    parsed_cs.append(curr_dqs)
                    valid_cands.append((i, v, cs))
            if parsed_cs:
                # TODO: store lengths of cands separately, so don't have zero
                #       padding for varying number of cands per example
                # found cands, pack them into tensor
                # default=0 keeps an empty candidate list from raising
                max_c_len = max(
                    max((len(c) for c in cs), default=0) for cs in parsed_cs)
                max_c_cnt = max(len(cs) for cs in parsed_cs)
                padded = []
                for cs in parsed_cs:
                    # pad every candidate to the longest candidate ...
                    rows = [
                        list(c) + [self.NULL_IDX] * (max_c_len - len(c))
                        for c in cs
                    ]
                    # ... and fill missing candidates with whole NULL rows so
                    # the nested list stays rectangular (padding with bare
                    # NULL_IDX scalars would make it ragged)
                    rows.extend(
                        [self.NULL_IDX] * max_c_len
                        for _ in range(max_c_cnt - len(cs)))
                    padded.append(rows)
                cands = torch.LongTensor(padded)
                if self.use_cuda:
                    # copy to gpu
                    self.cands.resize_(cands.size())
                    self.cands.copy_(cands, non_blocking=True)
                    cands = Variable(self.cands)
                else:
                    cands = Variable(cands)

        return xs, ys, labels, valid_inds, cands, valid_cands

示例#5

0

显示文件

    def vectorize(self, observations, seq_len, is_training):
        """Build lists of input and target tensors from a batch of observations.

        Returns ``(data_list, targets_list, labels, valid_inds, y_lens)``.
        In training mode the last three are always ``None``, and all five are
        ``None`` while ``self.next_batch`` holds no more than
        ``self.batchsize`` entries.
        """
        labels = valid_inds = y_lens = None

        if not is_training:
            # evaluation: let pad_text pick the valid examples and pad them
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations,
                self.dict,
                end_idx=self.END_IDX,
                null_idx=self.NULL_IDX)

            if xs is not None:
                xs = Variable(torch.LongTensor(xs))
                if self.use_cuda:
                    xs = xs.cuda()
            if ys is not None:
                ys = Variable(torch.LongTensor(ys))
                if self.use_cuda:
                    ys = ys.cuda()
            return [xs], [ys], labels, valid_inds, y_lens

        # training: queue up the pre-vectorized text of every observation
        for obs in observations:
            if obs and 'text2vec' in obs:
                self.next_batch += obs['text2vec']

        # wait until strictly more than one batch worth of entries is queued
        if len(self.next_batch) <= self.batchsize:
            return None, None, None, None, None

        data_list, targets_list = [], []
        num_batches = len(self.next_batch) // self.batchsize
        for _ in range(num_batches):
            # take the next batchsize entries off the front of the queue
            chunk = self.next_batch[:self.batchsize]
            self.next_batch = self.next_batch[self.batchsize:]

            # transpose so the first dimension runs along each entry
            stacked = torch.LongTensor(chunk).t().contiguous()
            inputs = Variable(stacked[:seq_len])
            # targets are the same rows shifted forward by one position
            shifted = Variable(stacked[1:])

            if self.use_cuda:
                inputs = inputs.cuda()
                shifted = shifted.cuda()

            data_list.append(inputs)
            targets_list.append(shifted)

        return data_list, targets_list, labels, valid_inds, y_lens

示例#6

0

显示文件

文件： language_model.py 项目： ahiroto/ParlAI

    def vectorize(self, observations, seq_len, is_training):
        """Convert a list of observations into input & target tensors.

        Returns ``(data_list, targets_list, labels, valid_inds, y_lens)``.
        In training mode the last three are always ``None``, and all five are
        ``None`` while ``self.next_batch`` holds no more than
        ``self.batchsize`` entries. In evaluation mode the two lists each
        hold a single element, which is ``None`` when ``pad_text`` returned
        ``None`` for it.
        """
        labels = None
        valid_inds = None
        y_lens = None
        if is_training:
            # queue up the pre-vectorized text of every observation
            for obs in observations:
                if obs:
                    if 'text2vec' in obs:
                        self.next_batch += obs['text2vec']
            if len(self.next_batch) <= self.batchsize:
                return None, None, None, None, None
            else:
                data_list = []
                targets_list = []
                # total is the number of batches
                total = len(self.next_batch)//self.batchsize
                for _ in range(total):
                    batch = self.next_batch[:self.batchsize]
                    self.next_batch = self.next_batch[self.batchsize:]

                    source = torch.LongTensor(batch).t().contiguous()
                    data = Variable(source[:seq_len])
                    # targets are the same rows shifted forward by one
                    targets = Variable(source[1:])

                    if self.use_cuda:
                        data = data.cuda()
                        targets = targets.cuda()

                    data_list.append(data)
                    targets_list.append(targets)
        else:
            # here we get valid examples and pad them
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations, self.dict, self.END_IDX, self.NULL_IDX)
            # pad_text can return None (e.g. no valid examples); wrapping
            # None in a Variable would raise, so leave it as None instead
            if xs is not None:
                xs = Variable(xs)
                if self.use_cuda:
                    xs = xs.cuda()
            if ys is not None:
                ys = Variable(ys)
                if self.use_cuda:
                    ys = ys.cuda()
            data_list = [xs]
            targets_list = [ys]

        return data_list, targets_list, labels, valid_inds, y_lens

示例#7

0

显示文件

文件： seq2seq.py 项目： tony-blake/ParlAI

    def vectorize(self, observations):
        """Convert a list of observations into input & target tensors.

        Returns ``(xs, ys, labels, valid_inds, cands, valid_cands)``; all six
        are ``None`` when ``pad_text`` reports no inputs. ``cands`` and
        ``valid_cands`` are only built when there are no targets and
        ``self.rank`` is set.
        """
        xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text(
            observations,
            self.dict,
            self.END_IDX,
            self.NULL_IDX,
            dq=True,
            eval_labels=False,
            truncate=self.truncate)
        if xs is None:
            return None, None, None, None, None, None
        if self.use_cuda:
            # copy to gpu; `async` became a reserved word in Python 3.7, so
            # the keyword argument is spelled `non_blocking` here
            self.xs.resize_(xs.size())
            self.xs.copy_(xs, non_blocking=True)
            xs = Variable(self.xs)
            if ys is not None:
                self.ys.resize_(ys.size())
                self.ys.copy_(ys, non_blocking=True)
                ys = Variable(self.ys)
        else:
            xs = Variable(xs)
            if ys is not None:
                ys = Variable(ys)

        # set up candidates
        cands = None
        valid_cands = None
        if ys is None and self.rank:
            # only do ranking when no targets available and ranking flag set
            parsed_cs = []
            valid_cands = []
            for i, v in enumerate(valid_inds):
                if 'label_candidates' in observations[v]:
                    # keep the original strings next to their parsed versions
                    cs = list(observations[v]['label_candidates'])
                    # bounded deques cap each parsed candidate at
                    # self.truncate entries
                    curr_dqs = [deque(maxlen=self.truncate) for _ in cs]
                    for dq, c in zip(curr_dqs, cs):
                        dq.extendleft(reversed(self.parse(c)))
                    parsed_cs.append(curr_dqs)
                    valid_cands.append((i, v, cs))
            if parsed_cs:
                # TODO: store lengths of cands separately, so don't have zero
                #       padding for varying number of cands per example
                # found cands, pack them into tensor
                # default=0 keeps an empty candidate list from raising
                max_c_len = max(
                    max((len(c) for c in cs), default=0) for cs in parsed_cs)
                max_c_cnt = max(len(cs) for cs in parsed_cs)
                padded = []
                for cs in parsed_cs:
                    # right-pad each candidate up to the longest candidate
                    rows = [
                        list(c) + [self.NULL_IDX] * (max_c_len - len(c))
                        for c in cs
                    ]
                    # missing candidates become whole NULL rows; appending
                    # bare NULL_IDX scalars would leave the nesting ragged
                    rows.extend(
                        [self.NULL_IDX] * max_c_len
                        for _ in range(max_c_cnt - len(cs)))
                    padded.append(rows)
                cands = torch.LongTensor(padded)
                if self.use_cuda:
                    # copy to gpu
                    self.cands.resize_(cands.size())
                    self.cands.copy_(cands, non_blocking=True)
                    cands = Variable(self.cands)
                else:
                    cands = Variable(cands)

        return xs, ys, labels, valid_inds, cands, valid_cands