def collate_fn(self, batch, padding=True):
    """Collate function passed to the PyTorch DataLoader.

    :param batch: list of samples; in 'train' mode each sample is a
        (title, toc, intro, label) tuple, otherwise (title, toc, intro).
    :param padding: when True, pad each field to a common length.

    Returns:
        (title, title_lengths): tuple containing padded sequence tensor for title and sequence lengths
        (toc, toc_lengths): tuple containing padded sequence tensor for table of contents and sequence lengths
        (intro, intro_lengths): tuple containing padded sequence tensor for introduction and sequence lengths
        labels: tensor containing labels for the batch (train mode only)
    """
    # collections.Sequence was removed in Python 3.10; the abc submodule is
    # the supported location.
    from collections.abc import Sequence

    if self.mode == 'train':
        title, toc, intro, labels = zip(*batch)
        labels = torch.cat(labels)
    else:
        title, toc, intro = zip(*batch)

    if isinstance(intro, Sequence):
        if padding:
            title, title_lengths = stack_and_pad_tensors(title)
            toc, toc_lengths = stack_and_pad_tensors(toc)
            intro, intro_lengths = stack_and_pad_tensors(intro)
            if self.mode == 'train':
                return (title, title_lengths), (toc, toc_lengths), (intro, intro_lengths), labels
            return (title, title_lengths), (toc, toc_lengths), (intro, intro_lengths)
        # NOTE(review): padding=False with sequence input falls through and
        # returns None — confirm that no caller relies on this path.
    else:
        return batch
def collate_fn_rnn(batch):
    """Collate a batch of documents for the RNN model.

    Pads every sentence to the longest sentence in the batch, then pads
    documents to a common sentence count.

    :param batch: list of dicts with keys "sents" (list of 1-D tensors),
        "tags" (1-D tensor) and optionally "encoding" (tensor).
    Returns (sents_batch, tags_batch, encoding_batch_or_None), moved to
    CUDA when available.
    """
    # Longest sentence across the whole batch; every sentence pads to it.
    sent_lengths = [[len(sent) for sent in doc["sents"]] for doc in batch]
    max_sent_len = max(max(lengths) for lengths in sent_lengths)

    # Pad sentences within each doc, then pad docs to equal sentence counts.
    # (Removed an unused `transpose` lambda and an unused lengths binding.)
    sents_batch, _ = stack_and_pad_tensors([
        torch.stack([pad_tensor(sent, max_sent_len) for sent in doc["sents"]])
        for doc in batch
    ])
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    # Move to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0]:  # add doc encoding if applicable
        encoding_batch = torch.stack([doc["encoding"] for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)
    return (sents_batch, tags_batch, None)
def collate_fn_transformer1(batch):
    """Collate documents for the transformer model.

    Flattens every sentence in the batch into one padded tensor and keeps
    per-document sentence counts so documents can be re-assembled later.
    """
    flat_sentences = [sent for doc in batch for sent in doc["sents"]]
    sents_batch, sents_len_batch = stack_and_pad_tensors(flat_sentences)
    doc_lens_batch = [len(doc["sents"]) for doc in batch]
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    sents_len_batch = sents_len_batch.to(device)
    # doc_lens_batch stays a plain Python list (not moved to device).
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0].keys():
        encoding_batch = torch.stack([doc["encoding"] for doc in batch]).to(device)
        return (
            sents_batch,
            sents_len_batch,
            doc_lens_batch,
            tags_batch,
            encoding_batch,
        )
    return (sents_batch, sents_len_batch, doc_lens_batch, tags_batch)
def collate_fn(batch, train=True):
    """Convert a list of sample dicts into batched, time-major tensors."""
    def to_time_major(t):
        # PyTorch RNNs expect batches transposed for speed / CUDA integration.
        return t.t_().squeeze(0).contiguous()

    premises = [row['premise'] for row in batch]
    hypotheses = [row['hypothesis'] for row in batch]
    premise_batch, _ = stack_and_pad_tensors(premises)
    hypothesis_batch, _ = stack_and_pad_tensors(hypotheses)
    label_batch = torch.stack([row['label'] for row in batch])

    return (to_time_major(premise_batch),
            to_time_major(hypothesis_batch),
            to_time_major(label_batch))
def batch_encode(self, iterator):
    """Encode every example in *iterator* and pad the results into one batch.

    Returns a BatchedSentence holding the padded ids with their lengths,
    the padded token bounds with their lengths, the strict masks, and the
    per-example token counts.
    """
    encoded = [self.encode(example) for example in iterator]
    ids, bounds, strict_masks, number_of_tokens = zip(*encoded)

    padded_ids = stack_and_pad_tensors(ids, padding_index=self.padding_index, dim=0)
    padded_bounds = stack_and_pad_tensors(bounds, padding_index=-1, dim=0)
    padded_masks = stack_and_pad_tensors(strict_masks, padding_index=False, dim=0)
    token_counts = torch.tensor(number_of_tokens, dtype=torch.int)

    return BatchedSentence(
        tensor=padded_ids.tensor,
        lengths=padded_ids.lengths,
        bounds=padded_bounds.tensor,
        bounds_lengths=padded_bounds.lengths,
        strict_masks=padded_masks.tensor,
        number_of_tokens=token_counts,
    )
def collate_fn_eval_base(batch):
    """Pad a list of evaluation samples into batch tensors."""
    word_ids_batch, _ = stack_and_pad_tensors([seq['word_ids'] for seq in batch])
    label_batch, _ = stack_and_pad_tensors([seq['labels'] for seq in batch])
    seq_len_batch = torch.LongTensor([len(seq['word_ids']) for seq in batch])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    word_ids_batch = word_ids_batch.to(device)
    seq_len_batch = seq_len_batch.to(device)
    label_batch = label_batch.to(device)

    def time_major(t):
        # PyTorch RNNs expect batches transposed for speed / CUDA integration.
        return t.t_().squeeze(0).contiguous()

    return (time_major(word_ids_batch), seq_len_batch, time_major(label_batch))
def collate_fn_infer(batch):
    """Pad a list of inference samples into a single time-major batch tensor."""
    padded, _ = stack_and_pad_tensors([row['word_ids'] for row in batch])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    padded = padded.to(device)
    # PyTorch RNNs expect batches transposed for speed / CUDA integration.
    return padded.t_().squeeze(0).contiguous()
def test_stack_and_pad_tensors():
    """Shorter rows are right-padded with the padding index; lengths are kept."""
    batch = [
        torch.LongTensor([1, 2, 3]),
        torch.LongTensor([1, 2]),
        torch.LongTensor([1]),
    ]
    padded, lengths = stack_and_pad_tensors(batch, DEFAULT_PADDING_INDEX)
    expected = [
        [1, 2, 3],
        [1, 2, DEFAULT_PADDING_INDEX],
        [1, DEFAULT_PADDING_INDEX, DEFAULT_PADDING_INDEX],
    ]
    assert [row.tolist() for row in padded] == expected
    assert lengths.tolist() == [3, 2, 1]
def batch_encode_trackpos(
    self, iterator, dim=0, **kwargs
) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor):
    """
    Encode a batch of examples while tracking word positions.

    :param iterator (iterator): Batch of text to encode.
    :param dim (int, optional): Dimension along which to concatenate tensors.
    :param **kwargs: Keyword arguments passed to 'encode'.

    Returns:
        torch.Tensor: Encoded and padded batch of sequences.
        torch.Tensor: Original lengths of the sequences.
        torch.Tensor: Padded position-tag indexes (padded with 0).
        torch.Tensor: Original lengths of the tag sequences.
    """
    sequences, tags = zip(
        *[self.encode_trackpos(object_) for object_ in iterator])
    sequences, seq_lengths = stack_and_pad_tensors(
        sequences, padding_index=self.padding_index, dim=dim)
    tag_idxs, tag_lengths = stack_and_pad_tensors(tags, padding_index=0, dim=dim)
    return sequences, seq_lengths, tag_idxs, tag_lengths
def preprocess_request(sentence, start_sign, end_sign, token, max_length):
    """Turn a raw sentence into padded encoder input and the decoder start token.

    Words missing from *token* map to index 3 — presumably the OOV id,
    verify against the vocabulary builder.
    """
    segmented = " ".join(jieba.cut(sentence))
    segmented = preprocess_sentence(start_sign, end_sign, segmented)
    ids = torch.tensor([token.get(word, 3) for word in segmented.split(' ')])
    padded = pad_tensor(tensor=ids[:max_length], length=max_length, padding_index=0)
    inputs = stack_and_pad_tensors([padded])[0]
    dec_input = torch.unsqueeze(torch.tensor([token[start_sign]]), 0)
    return inputs, dec_input
def collate_fn_transformer(batch):  # multigpu implementation
    """Collate documents for the transformer model (multi-GPU variant).

    Pads every sentence to the longest sentence in the batch, then pads
    documents to a common sentence count.
    """
    # Longest sentence across the whole batch.
    longest = max(len(sent) for doc in batch for sent in doc["sents"])

    padded_docs = [
        torch.stack([pad_tensor(sent, longest) for sent in doc["sents"]])
        for doc in batch
    ]
    sents_batch, doc_lens_batch = stack_and_pad_tensors(padded_docs)
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0].keys():  # add doc encoding if applicable
        encoding_batch = torch.stack([doc["encoding"] for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)
    return (sents_batch, tags_batch, None)
def batch_encode(self, iterator, dim=0, **kwargs) -> (torch.Tensor, torch.Tensor):
    """
    Encode a batch of text and pad the encodings to a common length.

    :param iterator (iterator): Batch of text to encode.
    :param dim (int, optional): Dimension along which to concatenate tensors.
    :param **kwargs: Keyword arguments passed to 'encode'.

    Returns torch.Tensor, torch.Tensor: Encoded and padded batch of
    sequences; original lengths of sequences.
    """
    encoded = Encoder.batch_encode(self, iterator, **kwargs)
    return stack_and_pad_tensors(encoded, padding_index=self.padding_index, dim=dim)
def prepare_sample(self, sample: list, prepare_target: bool = True) -> (dict, dict):
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries, each with at least a "text" key
        and (when targets are prepared) a "tags" key holding a
        whitespace-separated tag string.
    :param prepare_target: when False, skip target encoding and return an
        empty target dict.

    Returns:
        - dictionary with the expected model inputs.
        - dictionary with the expected target values ({"tags": tensor}).
    """
    sample = collate_tensors(sample)
    inputs = self.encoder.prepare_sample(sample["text"], trackpos=True)
    if not prepare_target:
        return inputs, {}
    # Encode each tag string and pad rows with vocab_size, which acts as
    # the "ignore" index downstream — NOTE(review): confirm the loss is
    # configured to ignore that index.
    tags, _ = stack_and_pad_tensors(
        [
            self.label_encoder.batch_encode(tags.split())
            for tags in sample["tags"]
        ],
        padding_index=self.label_encoder.vocab_size,
    )
    if self.hparams.ignore_first_title:
        # Replace a leading "T" tag with the ignore index so the first
        # title token does not contribute to the target.
        first_tokens = tags[:, 0].clone()
        tags[:, 0] = first_tokens.masked_fill_(
            # NOTE(review): mixes self._label_encoder and self.label_encoder —
            # presumably aliases of the same encoder; verify.
            first_tokens == self._label_encoder.token_to_index["T"],
            self.label_encoder.vocab_size,
        )
    # TODO is this still needed ?
    if self.hparams.ignore_last_tag:
        # If the last real tag of a sequence equals 1, overwrite it with
        # the ignore index. `length` counts tags in the ORIGINAL string,
        # so `length - 1` indexes the last non-padding position.
        lengths = [len(tags.split()) for tags in sample["tags"]]
        lengths = np.asarray(lengths)
        k = 0
        for length in lengths:
            if tags[k][length - 1] == 1:
                tags[k][length - 1] = self.label_encoder.vocab_size
            k += 1
    targets = {"tags": tags}
    return inputs, targets
def test_stack_and_pad_tensors__dim():
    """With dim=1 the output is (max_len, batch) instead of (batch, max_len)."""
    batch_size = 3
    batch = [
        torch.LongTensor([1, 2, 3, 4]),
        torch.LongTensor([1, 2, 3]),
        torch.LongTensor([1, 2]),
    ]
    padded, lengths = stack_and_pad_tensors(batch, DEFAULT_PADDING_INDEX, dim=1)

    assert padded.shape == (4, batch_size)
    assert lengths.shape == (1, batch_size)
    assert lengths.tolist() == [[4, 3, 2]]
    expected = [
        [1, 1, 1],
        [2, 2, 2],
        [3, 3, DEFAULT_PADDING_INDEX],
        [4, DEFAULT_PADDING_INDEX, DEFAULT_PADDING_INDEX],
    ]
    assert padded.tolist() == expected
def forward(self, sents):  # for support of multi-gpu
    """Encode sentences, then documents, and return predictions.

    *sents* is a (n_doc, n_sents, sen_len) tensor.
    """
    n_doc, n_sents, sen_len = sents.size()
    flattened = sents.view(-1, sen_len)
    sent_vecs, word_attn_weight = self.sent_encoder(flattened)
    # Regroup the flattened sentence encodings back into documents.
    per_doc = sent_vecs.split(split_size=[n_sents] * n_doc)
    # stack and pad
    padded_docs, _ = stack_and_pad_tensors(per_doc)
    # get predictions
    y_pred, sent_attn_weight = self.doc_encoder(padded_docs)
    # Return 0 as reconstruction loss for caps nets.
    return (y_pred, word_attn_weight, sent_attn_weight, 0)
def forward(self, sents):
    """Encode sentences, then documents, and classify each document.

    *sents* is a (n_doc, n_sents, sen_len) tensor.
    """
    n_doc, n_sents, sen_len = sents.size()
    flattened = sents.view(-1, sen_len)
    sent_vecs, word_attn_weight = self.sent_encoder(flattened)
    # Regroup sentence encodings per document, then stack and pad.
    grouped = sent_vecs.split(split_size=[n_sents] * n_doc)
    doc_input, _ = stack_and_pad_tensors(grouped)
    # Document encoding -> batch-norm -> dropout -> output layer.
    doc_encoding, sent_attn_weight = self.doc_encoder(doc_input)
    doc_encoding = self.drop(self.bn(doc_encoding))
    y_pred = self.out(doc_encoding)
    # Return 0 as reconstruction loss for caps nets.
    return (y_pred, word_attn_weight, sent_attn_weight, 0)
def load_data(dict_fn, data_fn, batch_size, start_sign, end_sign, checkpoint_dir, max_length,
              max_train_data_size=0):
    """
    Load and format the pre-tokenized training data, saving the vocabulary
    dictionary along the way so other features can reuse it later.

    Args:
        dict_fn: path where the training-data vocabulary is written (JSON)
        data_fn: path of the pre-tokenized training data
        batch_size: batch size
        start_sign: start-of-sequence marker
        end_sign: end-of-sequence marker
        checkpoint_dir: checkpoint save path
            # NOTE(review): accepted but unused in this function — confirm.
        max_length: maximum sentence length (longer inputs are truncated)
        max_train_data_size: maximum amount of training data to read (0 = all)
    Returns:
        loader: PyTorch DataLoader over (input, target, dialogue-weight) triples
        steps_per_epoch: number of steps per epoch
    """
    print("训练数据读取中...")
    (input_lang, target_lang), diag_weight = read_tokenized_data(data_fn, start_sign,
                                                                 end_sign, max_train_data_size)
    diag_weight = torch.tensor(diag_weight, dtype=torch.float32)
    # Merge input and target so one shared vocabulary covers both sides.
    lang = np.hstack((input_lang, target_lang))
    print("读取完成,正在格式化训练数据...")
    tokenizer = StaticTokenizerEncoder(sample=lang, tokenize=lambda x: x.split())
    # Convert the text sequences to token ids, truncate to max_length, and pad.
    input_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length], length=max_length,
                   padding_index=0) for example in input_lang
    ]
    target_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length], length=max_length,
                   padding_index=0) for example in target_lang
    ]
    input_tensor = stack_and_pad_tensors(input_data)[0]
    target_tensor = stack_and_pad_tensors(target_data)[0]

    print("格式化完成,正在整理训练数据并保存字典")
    # Build a bidirectional mapping: word -> index AND index -> word live in
    # the same dict, so the saved JSON serves both lookup directions.
    word_index = {}
    vocab_list = tokenizer.vocab
    for i in range(tokenizer.vocab_size):
        word_index[vocab_list[i]] = i
        word_index[i] = vocab_list[i]

    with open(dict_fn, 'w', encoding='utf-8') as file:
        file.write(json.dumps(word_index, indent=4, ensure_ascii=False))
    print("数据字典保存完成!")

    dataset = PairDataset(input_tensor, target_tensor, diag_weight)
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True,
                        num_workers=2)
    steps_per_epoch = len(input_tensor) // batch_size

    return loader, steps_per_epoch