def collate_fn(self, batch, padding=True):
        """
        Collate function needs to be passed to the pytorch dataloader

        Returns:
        (title,title_lengths): tuple containing padded sequence tensor for title and sequence lengths 
        (toc,toc_lengths): tuple containing padded sequence tensor for table of contents and sequence lengths 
        (intro,intro_lengths): tuple containing padded sequence tensor for introduction and sequence lengths 
        labels: tensor containing labels for the batch
        """
        if self.mode == 'train':
            title, toc, intro, labels = zip(*batch)
            labels = torch.cat(labels)
        else:
            title, toc, intro = zip(*batch)

        if isinstance(intro, collections.abc.Sequence):  # collections.Sequence was removed in Python 3.10

            if padding:
                title, title_lengths = stack_and_pad_tensors(title)
                toc, toc_lengths = stack_and_pad_tensors(toc)
                intro, intro_lengths = stack_and_pad_tensors(intro)
            else:
                # Padding skipped: keep the raw sequences and just report their lengths
                title_lengths = torch.tensor([len(t) for t in title])
                toc_lengths = torch.tensor([len(t) for t in toc])
                intro_lengths = torch.tensor([len(t) for t in intro])

            if self.mode == 'train':
                return ((title, title_lengths), (toc, toc_lengths),
                        (intro, intro_lengths), labels)
            else:
                return ((title, title_lengths), (toc, toc_lengths),
                        (intro, intro_lengths))
        else:
            return batch
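A minimal sketch of wiring this collate method into a DataLoader; "WikiDataset" and its constructor arguments are hypothetical stand-ins for whatever dataset class defines the method above:

from torch.utils.data import DataLoader

# Hypothetical dataset class that defines the collate_fn method above
dataset = WikiDataset(examples, mode='train')
loader = DataLoader(dataset,
                    batch_size=32,
                    shuffle=True,
                    collate_fn=dataset.collate_fn)  # bound method, so self is the dataset

(title, title_lens), (toc, toc_lens), (intro, intro_lens), labels = next(iter(loader))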
Example #2
def collate_fn_rnn(batch):
    # Shape tensors into the right format
    sents_len_batch = [[len(sent) for sent in doc["sents"]] for doc in batch]
    max_sent_len = max([max(s) for s in sents_len_batch])
    sents_batch, doc_lens_batch = stack_and_pad_tensors([
        torch.stack([pad_tensor(sent, max_sent_len) for sent in doc["sents"]])
        for doc in batch
    ])
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    # Move to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0].keys():  # add doc encoding if applicable
        encoding_batch = torch.stack([doc["encoding"]
                                      for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)

    return (sents_batch, tags_batch, None)
Example #3
def collate_fn_transformer1(batch):

    sents_batch, sents_len_batch = stack_and_pad_tensors(
        [sent for doc in batch for sent in doc["sents"]])
    doc_lens_batch = [len(doc["sents"]) for doc in batch]

    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    sents_len_batch = sents_len_batch.to(device)
    # doc_lens_batch is a plain Python list, so it stays on the CPU
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0].keys():
        encoding_batch = torch.stack([doc["encoding"]
                                      for doc in batch]).to(device)
        return (
            sents_batch,
            sents_len_batch,
            doc_lens_batch,
            tags_batch,
            encoding_batch,
        )

    return (sents_batch, sents_len_batch, doc_lens_batch, tags_batch)
Example #4
def collate_fn(batch, train=True):
    """ list of tensors to a batch tensors """
    premise_batch, _ = stack_and_pad_tensors([row['premise'] for row in batch])
    hypothesis_batch, _ = stack_and_pad_tensors(
        [row['hypothesis'] for row in batch])
    label_batch = torch.stack([row['label'] for row in batch])

    # PyTorch RNN requires batches to be transposed for speed and integration with CUDA
    transpose = (lambda b: b.t_().squeeze(0).contiguous())

    return (transpose(premise_batch), transpose(hypothesis_batch),
            transpose(label_batch))
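The transposed (seq_len, batch) layout matches PyTorch RNN modules, which default to batch_first=False. A minimal consuming sketch; the vocabulary and dimension sizes are illustrative, not taken from the example above:

import torch
import torch.nn as nn

vocab_size, embed_dim, hidden_dim = 1000, 64, 128  # illustrative sizes

embedding = nn.Embedding(vocab_size, embed_dim)
lstm = nn.LSTM(embed_dim, hidden_dim)  # batch_first=False by default

premise = torch.randint(0, vocab_size, (12, 32))  # (seq_len, batch), as returned above
output, (h_n, c_n) = lstm(embedding(premise))     # output: (seq_len, batch, hidden_dim)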
Example #5
    def batch_encode(self, iterator):
        ids, bounds, strict_masks, number_of_tokens = list(
            zip(*[self.encode(example) for example in iterator])
        )
        batch = stack_and_pad_tensors(ids, padding_index=self.padding_index, dim=0)
        bounds_batch = stack_and_pad_tensors(bounds, padding_index=-1, dim=0)
        masks_batch = stack_and_pad_tensors(strict_masks, padding_index=False, dim=0)
        number_of_tokens_batch = torch.tensor(number_of_tokens, dtype=torch.int)

        return BatchedSentence(
            tensor=batch.tensor,
            lengths=batch.lengths,
            bounds=bounds_batch.tensor,
            bounds_lengths=bounds_batch.lengths,
            strict_masks=masks_batch.tensor,
            number_of_tokens=number_of_tokens_batch,
        )
Example #6
def collate_fn_eval_base(batch):
    """ list of tensors to a batch tensors """

    word_ids_batch, _ = stack_and_pad_tensors(
        [seq['word_ids'] for seq in batch])
    label_batch, _ = stack_and_pad_tensors([seq['labels'] for seq in batch])
    seq_len_batch = torch.LongTensor([len(seq['word_ids']) for seq in batch])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    word_ids_batch = word_ids_batch.to(device)
    seq_len_batch = seq_len_batch.to(device)
    label_batch = label_batch.to(device)

    # PyTorch RNN requires batches to be transposed for speed and integration with CUDA
    transpose = (lambda b: b.t_().squeeze(0).contiguous())

    return (transpose(word_ids_batch), seq_len_batch, transpose(label_batch))
Example #7
def collate_fn_infer(batch):
    """ list of tensors to a batch tensors """
    batch, _ = stack_and_pad_tensors([row['word_ids'] for row in batch])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    batch = batch.to(device)

    # PyTorch RNN requires batches to be transposed for speed and integration with CUDA
    transpose = (lambda b: b.t_().squeeze(0).contiguous())

    return transpose(batch)
Example #8
def test_stack_and_pad_tensors():
    batch = [
        torch.LongTensor([1, 2, 3]),
        torch.LongTensor([1, 2]),
        torch.LongTensor([1])
    ]
    padded, lengths = stack_and_pad_tensors(batch, DEFAULT_PADDING_INDEX)
    padded = [r.tolist() for r in padded]
    assert padded == [[1, 2, 3], [1, 2, DEFAULT_PADDING_INDEX],
                      [1, DEFAULT_PADDING_INDEX, DEFAULT_PADDING_INDEX]]
    assert lengths.tolist() == [3, 2, 1]
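The lengths tensor returned alongside the padded batch is what downstream code typically passes to pack_padded_sequence so an RNN can skip padding positions. A minimal sketch, assuming the pytorch-nlp import path:

import torch
from torch.nn.utils.rnn import pack_padded_sequence
from torchnlp.encoders.text import stack_and_pad_tensors

batch = [torch.LongTensor([1, 2, 3]), torch.LongTensor([1, 2]), torch.LongTensor([1])]
padded, lengths = stack_and_pad_tensors(batch)  # padded: (3, 3), lengths: [3, 2, 1]

# Treat each id as a 1-dim feature so the packed input has shape (batch, seq, features)
packed = pack_padded_sequence(padded.unsqueeze(-1).float(), lengths,
                              batch_first=True, enforce_sorted=False)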
Example #9
    def batch_encode_trackpos(self, iterator, dim=0, **kwargs):
        """
        :param iterator (iterator): Batch of text to encode.
        :param dim (int, optional): Dimension along which to concatenate tensors.
        :param **kwargs: Keyword arguments passed to 'encode'.

        Returns
            The encoded and padded batch of sequences, the original sequence
            lengths, the padded tag indices, and the tag lengths.
        """
        sequences, tags = zip(
            *[self.encode_trackpos(object_) for object_ in iterator])
        sequences, seq_lengths = stack_and_pad_tensors(
            sequences, padding_index=self.padding_index, dim=dim)
        tag_idxs, tag_lengths = stack_and_pad_tensors(tags,
                                                      padding_index=0,
                                                      dim=dim)
        return sequences, seq_lengths, tag_idxs, tag_lengths
Example #10
def preprocess_request(sentence, start_sign, end_sign, token, max_length):
    sentence = " ".join(jieba.cut(sentence))
    sentence = preprocess_sentence(start_sign, end_sign, sentence)
    inputs = [token.get(i, 3) for i in sentence.split(' ')]  # assumption: 3 is the UNK token id
    inputs = torch.tensor(inputs)
    inputs = [
        pad_tensor(tensor=inputs[:max_length],
                   length=max_length,
                   padding_index=0)
    ]
    inputs = stack_and_pad_tensors(inputs)[0]
    dec_input = torch.unsqueeze(torch.tensor([token[start_sign]]), 0)

    return inputs, dec_input
Example #11
def collate_fn_transformer(batch):  # multi-GPU implementation

    # Shape tensors into the right format
    sents_len_batch = [[len(sent) for sent in doc["sents"]] for doc in batch]
    max_sent_len = max([max(s) for s in sents_len_batch])
    sents_batch, doc_lens_batch = stack_and_pad_tensors([
        torch.stack([pad_tensor(sent, max_sent_len) for sent in doc["sents"]])
        for doc in batch
    ])
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    # Move to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0].keys():  # add doc encoding if applicable
        encoding_batch = torch.stack([doc["encoding"]
                                      for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)

    return (sents_batch, tags_batch, None)
Example #12
    def batch_encode(self, iterator, dim=0, **kwargs) -> (torch.Tensor, torch.Tensor):
        """
        :param iterator (iterator): Batch of text to encode.
        :param dim (int, optional): Dimension along which to concatenate tensors.
        :param **kwargs: Keyword arguments passed to 'encode'.

        Returns
            torch.Tensor, torch.Tensor: Encoded and padded batch of sequences; Original lengths of
                sequences.
        """
        return stack_and_pad_tensors(
            Encoder.batch_encode(self, iterator, **kwargs),
            padding_index=self.padding_index,
            dim=dim,
        )
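In pytorch-nlp this is the pattern exposed by StaticTokenizerEncoder.batch_encode; a short usage sketch (assuming pytorch-nlp >= 0.5, where stack_and_pad_tensors returns a (tensor, lengths) named tuple):

from torchnlp.encoders.text import StaticTokenizerEncoder

encoder = StaticTokenizerEncoder(["hello world", "hello there general"])
padded, lengths = encoder.batch_encode(["hello there", "world"])
# padded: (2, max_len) LongTensor; lengths: the sequence lengths before padding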
Example #13
    def prepare_sample(self,
                       sample: list,
                       prepare_target: bool = True) -> (dict, dict):
        """
        Function that prepares a sample to input the model.
        :param sample: list of dictionaries.
        
        Returns:
            - dictionary with the expected model inputs.
            - dictionary with the expected target values.
        """
        sample = collate_tensors(sample)
        inputs = self.encoder.prepare_sample(sample["text"], trackpos=True)
        if not prepare_target:
            return inputs, {}

        tags, _ = stack_and_pad_tensors(
            [
                self.label_encoder.batch_encode(tags.split())
                for tags in sample["tags"]
            ],
            padding_index=self.label_encoder.vocab_size,
        )

        if self.hparams.ignore_first_title:
            first_tokens = tags[:, 0].clone()
            tags[:, 0] = first_tokens.masked_fill_(
                first_tokens == self._label_encoder.token_to_index["T"],
                self.label_encoder.vocab_size,
            )

        # TODO is this still needed ?
        if self.hparams.ignore_last_tag:
            lengths = [len(t.split()) for t in sample["tags"]]
            for k, length in enumerate(lengths):
                if tags[k][length - 1] == 1:
                    tags[k][length - 1] = self.label_encoder.vocab_size

        targets = {"tags": tags}
        return inputs, targets
Example #14
def test_stack_and_pad_tensors__dim():
    batch_size = 3
    batch = [
        torch.LongTensor([1, 2, 3, 4]),
        torch.LongTensor([1, 2, 3]),
        torch.LongTensor([1, 2])
    ]
    padded, lengths = stack_and_pad_tensors(batch,
                                            DEFAULT_PADDING_INDEX,
                                            dim=1)
    assert padded.shape == (4, batch_size)
    assert lengths.shape == (1, batch_size)
    assert lengths.tolist() == [[4, 3, 2]]
    assert padded.tolist() == [[1, 1, 1], [2, 2, 2],
                               [3, 3, DEFAULT_PADDING_INDEX],
                               [4, DEFAULT_PADDING_INDEX, DEFAULT_PADDING_INDEX]]
Example #15
File: model.py  Project: mrvoh/HA-CapsNet
    def forward(self, sents):

        # for support of multi-gpu
        n_doc, n_sents, sen_len = sents.size()
        sents = sents.view(-1, sen_len)

        sen_encodings, word_attn_weight = self.sent_encoder(sents)

        sen_encodings = sen_encodings.split(split_size=[n_sents] * n_doc)
        # stack and pad
        sen_encodings, _ = stack_and_pad_tensors(sen_encodings)
        # get predictions
        y_pred, sent_attn_weight = self.doc_encoder(sen_encodings)
        return (
            y_pred,
            word_attn_weight,
            sent_attn_weight,
            0,
        )  # return 0 as reconstruction loss for caps nets
Example #16
File: model.py  Project: mrvoh/HA-CapsNet
    def forward(self, sents):

        n_doc, n_sents, sen_len = sents.size()
        sents = sents.view(-1, sen_len)

        sen_encodings, word_attn_weight = self.sent_encoder(sents)

        sen_encodings = sen_encodings.split(split_size=[n_sents] * n_doc)
        # stack and pad
        sen_encodings, _ = stack_and_pad_tensors(sen_encodings)
        # get predictions
        doc_encoding, sent_attn_weight = self.doc_encoder(sen_encodings)

        doc_encoding = self.drop(self.bn(doc_encoding))

        y_pred = self.out(doc_encoding)

        return (
            y_pred,
            word_attn_weight,
            sent_attn_weight,
            0,
        )  # return 0 as reconstruction loss for caps nets
Example #17
def load_data(dict_fn,
              data_fn,
              batch_size,
              start_sign,
              end_sign,
              checkpoint_dir,
              max_length,
              max_train_data_size=0):
    """
    数据加载方法,主要将分词好的数据进行整理,过程中保存字典文件,方便后续其他功能
    使用,方法返回处理好的dataset,steps_per_epoch,checkpoint_prefix
    Args:
        dict_fn: 将训练数据的字典保存,用于以后使用,路径
        data_fn: 分词好的训练数据路径
        batch_size: batch大小
        start_sign: 开始标记
        end_sign: 结束标记
        checkpoint_dir: 检查点保存路径
        max_length: 最大句子长度
        max_train_data_size: 最大训练数据大小
    Returns:
        dataset: PyTorch的DataLoader
        steps_per_epoch: 每轮的步数
        checkpoint_prefix: 保存检查点的前缀
    """
    print("训练数据读取中...")
    (input_lang,
     target_lang), diag_weight = read_tokenized_data(data_fn, start_sign,
                                                     end_sign,
                                                     max_train_data_size)
    diag_weight = torch.tensor(diag_weight, dtype=torch.float32)
    # Merge input and target to build a single shared vocabulary
    lang = np.hstack((input_lang, target_lang))
    print("读取完成,正在格式化训练数据...")
    tokenizer = StaticTokenizerEncoder(sample=lang,
                                       tokenize=lambda x: x.split())
    # Convert the text sequences to token ids and pad them
    input_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in input_lang
    ]
    target_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in target_lang
    ]
    input_tensor = stack_and_pad_tensors(input_data)[0]
    target_tensor = stack_and_pad_tensors(target_data)[0]

    print("格式化完成,正在整理训练数据并保存字典")
    word_index = {}
    vocab_list = tokenizer.vocab
    # Store both word->id and id->word mappings in a single dict
    for i in range(tokenizer.vocab_size):
        word_index[vocab_list[i]] = i
        word_index[i] = vocab_list[i]

    with open(dict_fn, 'w', encoding='utf-8') as file:
        file.write(json.dumps(word_index, indent=4, ensure_ascii=False))
    print("数据字典保存完成!")

    dataset = PairDataset(input_tensor, target_tensor, diag_weight)
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=2)
    steps_per_epoch = len(input_tensor) // batch_size

    return loader, steps_per_epoch
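A short usage sketch of the returned loader; the file paths here are hypothetical, and the unpacking assumes PairDataset yields (input, target, weight) triples, as the constructor above suggests:

loader, steps_per_epoch = load_data(dict_fn="data/dict.json",
                                    data_fn="data/train.txt",
                                    batch_size=64,
                                    start_sign="<start>",
                                    end_sign="<end>",
                                    checkpoint_dir="checkpoints/",
                                    max_length=40)

for input_tensor, target_tensor, weight in loader:
    # input_tensor / target_tensor: (batch_size, max_length) token-id tensors
    break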