示例#1
0
def collate_fn_rnn(batch):
    """Collate a list of document dicts into padded batch tensors for an RNN.

    :param batch: list of dicts with keys "sents" (list of 1-D token-id
        tensors), "tags" (tensor of labels) and optionally "encoding".
    :return: tuple ``(sents_batch, tags_batch, encoding_batch_or_None)``,
        all moved to CUDA when available.
    """
    # Pad every sentence to the longest sentence in the whole batch so the
    # per-document sentence tensors can be stacked into one rectangle.
    # (The previously defined `transpose` lambda was dead code and removed.)
    max_sent_len = max(len(sent) for doc in batch for sent in doc["sents"])
    sents_batch, _ = stack_and_pad_tensors([
        torch.stack([pad_tensor(sent, max_sent_len) for sent in doc["sents"]])
        for doc in batch
    ])
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    # Move to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    # Attach the per-document encoding when the dataset provides one.
    if "encoding" in batch[0]:
        encoding_batch = torch.stack([doc["encoding"]
                                      for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)

    return (sents_batch, tags_batch, None)
示例#2
0
def parse_dataset(path,
                  label_to_idx,
                  word_to_idx,
                  pos_target=False,
                  pad_len=None,
                  encoding='latin-1',
                  max_len=100):
    """Parse a column-formatted (CoNLL-style) file into a ``Dataset``.

    Each non-blank line is split on whitespace; columns 4+ are treated as the
    sub-unit (BPE) tokens of one word and mapped through ``word_to_idx``,
    while the label is read from column 1 (``pos_target=True``) or column 3.
    Blank lines end a sentence; sentences with ``max_len`` or more words are
    dropped.  Afterwards every word's sub-unit tensor is padded to a common
    length and each sentence's word tensors are flattened into one tensor.

    :param path: path of the file to read.
    :param label_to_idx: mapping from label string to integer id.
    :param word_to_idx: mapping from sub-unit string to integer id.
    :param pos_target: if True use column 1 as the label, else column 3.
    :param pad_len: optional lower bound on the per-word padding length.
    :param encoding: text encoding used to open the file.
    :param max_len: sentences with at least this many words are discarded.
    :return: tuple ``(Dataset(sentences), max_len_token)`` where
        ``max_len_token`` is the per-word padding length actually used.
    """
    sentences = []
    UNK = 3  # id assigned to out-of-vocabulary sub-units
    PAD = 1  # padding id for sub-unit sequences
    target_index = 1 if pos_target else 3
    nr_long = 0  # count of sentences longer than 100 labels (diagnostics)
    max_sus = 0  # largest number of words seen in a kept sentence
    with open(path, encoding=encoding) as f:

        sample = {'word_ids': [], 'labels': []}
        max_len_token = 0  # longest sub-unit sequence seen for a single word
        for line in f.read().splitlines():
            if line in ['\n', '\r\n', '']:  # blank line = end of sequence
                if len(sample['labels']) > 100:
                    nr_long += 1
                # Keep only non-empty sentences shorter than max_len words.
                if (len(sample['labels']) > 0) and (len(sample['word_ids']) <
                                                    max_len):
                    max_sus = max(max_sus, len(sample['word_ids']))
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': []}
                continue
            else:
                ls = line.split()
                # Columns 4+ presumably hold the word's sub-unit tokens —
                # TODO confirm against the dataset format.
                max_len_token = max(max_len_token, len(ls[4:]))
                word = ls[4:]
                label = ls[target_index]
                if len(word) > 0:
                    word_ids = [
                        word_to_idx[w] if w in word_to_idx.keys() else UNK
                        for w in word
                    ]
                    sample['word_ids'].append(
                        torch.LongTensor(word_ids))  # 3 -> <unk>
                    sample['labels'].append(label_to_idx[label])
                    if len(word_ids) > 20:
                        # NOTE(review): debug print for unusually long words.
                        print(line)

    # Pad all sub-unit (BPE) encodings to the max length seen in the dataset.
    if pad_len is not None:
        max_len_token = max(pad_len, max_len_token)
    for s in range(len(sentences)):
        sen = sentences[s]
        for i in range(len(sen['word_ids'])):
            sen['word_ids'][i] = pad_tensor(sen['word_ids'][i],
                                            length=max_len_token,
                                            padding_index=PAD)

        # Stack the per-word tensors back into one flat tensor per sentence.
        sen['word_ids'] = torch.stack(sen['word_ids'], dim=0).view(-1)
    print('max nr of SUs in sentence: {}'.format(max_sus))
    print('Number of long sentences: {}'.format(nr_long))

    return Dataset(sentences), max_len_token
示例#3
0
 def generate_encodings(self, data, labels):
     """Tokenize each document, pad its id sequence to length 10000, and
     return the padded inputs together with their labels as a DataFrame.
     """
     tokenizer = StaticTokenizerEncoder(data,
                                        tokenize=lambda text: text.split(),
                                        min_occurrences=3)
     padded = [
         pad_tensor(tokenizer.encode(document), length=10000)
         for document in data
     ]
     return pd.DataFrame(data={'labels': labels, 'inputs': padded})
示例#4
0
def tokenize_pos_tags(X_tags, tag_to_index, max_sen_len=800):
    """
    One hot encodes pos tags
    :param X_tags: iterable of POS-tag id sequences
    :param tag_to_index: mapping from tag to integer index
    :param max_sen_len: length every sequence is padded to
    :return: One hot encoded vector
    """
    num_classes = max(tag_to_index.values()) + 1
    padded_rows = []
    for tag_ids in X_tags:
        padded_rows.append(pad_tensor(torch.LongTensor(tag_ids), max_sen_len))
    return torch.nn.functional.one_hot(torch.stack(padded_rows),
                                       num_classes=num_classes)
示例#5
0
def preprocess_request(sentence, start_sign, end_sign, token, max_length):
    """Segment an inference request, map it to padded token ids, and build
    the initial decoder input (the start token).

    Returns ``(inputs, dec_input)`` where ``inputs`` has shape
    ``(1, max_length)`` and ``dec_input`` has shape ``(1, 1)``.
    """
    segmented = " ".join(jieba.cut(sentence))
    marked = preprocess_sentence(start_sign, end_sign, segmented)
    # Unknown words fall back to id 3.
    ids = torch.tensor([token.get(word, 3) for word in marked.split(' ')])
    padded = [
        pad_tensor(tensor=ids[:max_length],
                   length=max_length,
                   padding_index=0)
    ]
    inputs = stack_and_pad_tensors(padded)[0]
    dec_input = torch.unsqueeze(torch.tensor([token[start_sign]]), 0)

    return inputs, dec_input
示例#6
0
def pad_and_stack_list_of_list(
    list_of_list: list, max_sentence_len=800, pad_value=0, tensor_type=torch.FloatTensor
):
    """Pad every inner sequence to a fixed length and stack the results.

    :param list_of_list: list of list of sequence
    :param max_sentence_len: defaults to 800
    :param pad_value: defaults to 0
    :param tensor_type: defaults to torch.FloatTensor
    :return: stacked tensor
    """
    rows = []
    for sequence in list_of_list:
        row = pad_tensor(tensor_type(sequence),
                         length=max_sentence_len,
                         padding_index=pad_value)
        rows.append(row)
    return torch.stack(rows)
示例#7
0
def collate_fn_transformer(batch):  # multigpu implementation
    """Collate a list of document dicts into padded batch tensors for a
    transformer model.

    :param batch: list of dicts with keys "sents" (list of 1-D token-id
        tensors), "tags" (tensor of labels) and optionally "encoding".
    :return: tuple ``(sents_batch, tags_batch, encoding_batch_or_None)``,
        all moved to CUDA when available.
    """
    # Pad every sentence to the longest sentence in the whole batch so the
    # per-document sentence tensors can be stacked into one rectangle.
    max_sent_len = max(len(sent) for doc in batch for sent in doc["sents"])
    sents_batch, _ = stack_and_pad_tensors([
        torch.stack([pad_tensor(sent, max_sent_len) for sent in doc["sents"]])
        for doc in batch
    ])
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    # Move to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    # Attach the per-document encoding when the dataset provides one.
    if "encoding" in batch[0]:
        encoding_batch = torch.stack([doc["encoding"]
                                      for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)

    return (sents_batch, tags_batch, None)
示例#8
0
def load_data(dict_fn,
              data_fn,
              batch_size,
              start_sign,
              end_sign,
              checkpoint_dir,
              max_length,
              max_train_data_size=0):
    """Load tokenized training data, build the vocabulary, and return a loader.

    Reads pre-tokenized input/target pairs, fits one shared tokenizer over
    both sides, converts each side to truncated-and-padded id tensors, saves
    the vocabulary to ``dict_fn`` as JSON, and wraps everything in a
    ``DataLoader``.

    Args:
        dict_fn: path where the vocabulary dictionary is saved for later use.
        data_fn: path of the pre-tokenized training data.
        batch_size: batch size for the returned DataLoader.
        start_sign: start-of-sequence marker token.
        end_sign: end-of-sequence marker token.
        checkpoint_dir: checkpoint directory — NOTE(review): not used in this
            function; confirm whether callers still need to pass it.
        max_length: maximum sentence length; longer sequences are truncated.
        max_train_data_size: cap on the number of training samples (0 = all).
    Returns:
        loader: a PyTorch DataLoader over (input, target, weight) samples.
        steps_per_epoch: number of whole batches per epoch.
    """
    print("训练数据读取中...")
    (input_lang,
     target_lang), diag_weight = read_tokenized_data(data_fn, start_sign,
                                                     end_sign,
                                                     max_train_data_size)
    diag_weight = torch.tensor(diag_weight, dtype=torch.float32)
    # Concatenate input and target text so one shared vocabulary covers both.
    lang = np.hstack((input_lang, target_lang))
    print("读取完成,正在格式化训练数据...")
    tokenizer = StaticTokenizerEncoder(sample=lang,
                                       tokenize=lambda x: x.split())
    # Convert each text sequence to token ids, truncate to max_length, and
    # pad the remainder with 0.
    input_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in input_lang
    ]
    target_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in target_lang
    ]
    input_tensor = stack_and_pad_tensors(input_data)[0]
    target_tensor = stack_and_pad_tensors(target_data)[0]

    print("格式化完成,正在整理训练数据并保存字典")
    # Build a bidirectional mapping (word -> id and id -> word) in one dict.
    # NOTE(review): json.dumps stringifies the integer keys, so a vocabulary
    # token that itself looks like a number ("0", "1", ...) would collide
    # with an id key in the saved file — verify this cannot happen.
    word_index = {}
    vocab_list = tokenizer.vocab
    for i in range(tokenizer.vocab_size):
        word_index[vocab_list[i]] = i
        word_index[i] = vocab_list[i]

    with open(dict_fn, 'w', encoding='utf-8') as file:
        file.write(json.dumps(word_index, indent=4, ensure_ascii=False))
    print("数据字典保存完成!")

    dataset = PairDataset(input_tensor, target_tensor, diag_weight)
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=2)
    steps_per_epoch = len(input_tensor) // batch_size

    return loader, steps_per_epoch
示例#9
0
def test_pad_tensor():
    """pad_tensor extends a 1-D tensor to the target length with the pad id."""
    result = pad_tensor(torch.LongTensor([1, 2, 3]), 5, DEFAULT_PADDING_INDEX)
    expected = [1, 2, 3, DEFAULT_PADDING_INDEX, DEFAULT_PADDING_INDEX]
    assert result.tolist() == expected
示例#10
0
def test_pad_tensor_multiple_dim_float_tensor():
    """Padding a 2-D float tensor grows only its first dimension."""
    result = pad_tensor(torch.FloatTensor(778, 80), 804, DEFAULT_PADDING_INDEX)
    assert result.size() == (804, 80)
    # Appended rows are filled with the (zero) padding value.
    assert result[-1].sum().item() == pytest.approx(0)
    assert result.type() == 'torch.FloatTensor'
示例#11
0
def test_pad_tensor_multiple_dim():
    """Padding a 3-D tensor grows only its first dimension."""
    result = pad_tensor(torch.LongTensor(1, 2, 3), 5, DEFAULT_PADDING_INDEX)
    assert result.size() == (5, 2, 3)
    # Rows beyond the original first dimension hold the (zero) pad value.
    assert result[1].sum().item() == pytest.approx(0)
    max_sentence_len=max_sentence_len,
    pad_value=-1,
    tensor_type=torch.FloatTensor,
)
# NOTE(review): `ascii` shadows the builtin of the same name — consider
# renaming, but the stack below uses it as-is.
ascii = pad_and_stack_list_of_list(
    ascii,
    max_sentence_len=max_sentence_len,
    pad_value=-1,
    tensor_type=torch.FloatTensor,
)

# Stack the seven per-token feature channels along a new last dimension.
x_enriched_features = torch.stack(
    (alnum, numeric, alpha, digit, lower, title, ascii), dim=2)

# Word-level ids: encode each text and pad to the common sentence length.
x_encoded = [x_encoder.encode(text) for text in X_text_list]
x_padded = [pad_tensor(tensor, max_sentence_len) for tensor in x_encoded]
x_padded = torch.LongTensor(torch.stack(x_padded))

# Character-level ids: truncate each word to max_word_length characters and
# pad its char-id tensor to that length.  NOTE(review): the loop names are
# off by one level — `word` iterates sentences and `char` iterates words.
x_char_padded = [[
    pad_tensor(x_char_encoder.encode(char[:max_word_length]), max_word_length)
    for char in word
] for word in X_text_list_as_is]
# Then pad each sentence to max_sentence_len words and stack into a batch.
x_char_padded = [
    pad_tensor(torch.stack(lst), max_sentence_len) for lst in x_char_padded
]
x_char_padded = torch.stack(x_char_padded).type(torch.LongTensor)

# One-hot encoded POS tags, padded to the same sentence length.
x_postag_padded = tokenize_pos_tags(X_tags,
                                    tag_to_index=tag_to_index,
                                    max_sen_len=max_sentence_len)
示例#13
0
    reviews = []
    labels = []

    # Discard samples that are too long for the chosen pad length.
    for i in tqdm(range(len(encoded_texts))):
        if len(encoded_texts[i]) < max_pad_length:
            reviews.append(encoded_texts[i])
            labels.append(1 if labels_as_list[i] == "positive" else 0)

    assert len(reviews) == len(
        labels), "The labels and feature lists should have the same time"

    # Pad each individual sequence up to the maximum sequence length.
    padded_dataset = []
    for i in tqdm(range(len(reviews))):
        padded_dataset.append(pad_tensor(reviews[i], int(max_pad_length)))

    # preparing the final dataset:
    X = torch.stack(padded_dataset)
    y = torch.tensor(labels)

    # NOTE(review): result is discarded — presumably meant to inspect the
    # positive-class ratio; confirm the intent.
    (y == 1).float().mean()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.25,
                                                        random_state=42)

    # NOTE(review): the splits are already tensors; torch.tensor on a tensor
    # copies and emits a UserWarning — left unchanged here.
    X_train, y_train = torch.tensor(X_train), torch.tensor(y_train)
    X_test, y_test = torch.tensor(X_test), torch.tensor(y_test)