def get_dataloader(train_data, valid_data):
    """Wrap preprocessed train/valid frames in padded DictDataLoaders.

    Relies on module-level configuration: fixed_length_uttr,
    fixed_length_resp, fixed_length_turn, data_type and batch_size.

    Args:
        train_data: preprocessed training data accepted by PairDataset.
        valid_data: preprocessed validation data accepted by PairDataset.

    Returns:
        A (train_dataloader, valid_dataloader) tuple; the train loader
        shuffles, the valid loader does not.
    """
    train_dataset = PairDataset(train_data, num_neg=0)
    valid_dataset = PairDataset(valid_data, num_neg=0)
    padding = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                             fixed_length_resp=fixed_length_resp,
                             fixed_length_turn=fixed_length_turn,
                             data_type=data_type)

    train_dataloader = DictDataLoader(train_dataset,
                                      batch_size=batch_size,
                                      turns=fixed_length_turn,
                                      stage='train',
                                      shuffle=True,
                                      sort=False,
                                      callback=padding)
    valid_dataloader = DictDataLoader(valid_dataset,
                                      batch_size=batch_size,
                                      turns=fixed_length_turn,
                                      stage='dev',
                                      shuffle=False,
                                      sort=False,
                                      callback=padding)

    # Sanity-check the first batch only: log the utterance tensor shape.
    # Fixes the "utternace" typo and uses lazy %-style logging so the
    # format work is skipped when INFO is disabled.
    for i, (x, y) in enumerate(train_dataloader):
        logger.info("The shape of utterance is %s", x[constants.UTTRS].shape)
        if i == 0:
            break
    return train_dataloader, valid_dataloader
def get_dataloader(data):
    """Build a single evaluation ('dev') DictDataLoader over *data*.

    Uses the module-level padding/batching configuration
    (fixed_length_uttr, fixed_length_resp, fixed_length_turn, batch_size).
    """
    pad_cb = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                            fixed_length_resp=fixed_length_resp,
                            fixed_length_turn=fixed_length_turn)
    # No shuffling/sorting: evaluation order must match the input order.
    return DictDataLoader(PairDataset(data, num_neg=0),
                          batch_size=batch_size,
                          turns=fixed_length_turn,
                          stage='dev',
                          shuffle=False,
                          sort=False,
                          callback=pad_cb)
    def get_dataloader(self, utterance: str, responses: list):
        """Build a one-batch test DataLoader pairing a single multi-turn
        utterance (turns separated by tabs) with every candidate response.

        Args:
            utterance: tab-joined dialogue turns.
            responses: candidate response strings, one row each.

        Returns:
            A DictDataLoader that yields the whole candidate set in a
            single batch on ``self.device``.
        """
        # One row per candidate; scalar utterance/turn-count/label values
        # broadcast across all rows.
        frame = pd.DataFrame()
        frame[constants.RESP] = responses
        frame[constants.UTTRS] = utterance
        frame[constants.TURNS] = len(utterance.split("\t"))
        frame[constants.LABEL] = 0
        frame = self.preprocessor.transform(frame, drop=False)

        paired = PairDataset(frame, num_neg=0)
        padder = MultiQAPadding(self.uttr_len,
                                self.resp_len,
                                self.turns,
                                data_type=self.data_type)
        # batch_size == len(dataset): everything arrives in one batch.
        return DictDataLoader(paired,
                              batch_size=len(paired),
                              turns=self.turns,
                              stage="test",
                              device=self.device,
                              shuffle=False,
                              sort=False,
                              callback=padder)
data['label'] = data['label'].astype(int)

# Train/validation split: first 90 rows train, the remainder validation.
train, valid = data[:90], data[90:]

# Load pre-trained word vectors and build the embedding matrix from the
# preprocessor's term index.
basename = "/home/speech/models"
logger.info("读取词向量文件")
word_embedding = load_from_file(Path(basename) / "500000-small.txt")
embedding_matrix = word_embedding.build_matrix(preprocessor.context['term_index'])

# Wrap both splits with Dataset / DataLoader.
logger.info("使用Dataset和DataLoader对数据进行封装")
train_dataset = PairDataset(train, num_neg=0)
valid_dataset = PairDataset(valid, num_neg=0)
padding = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                         fixed_length_resp=fixed_length_resp,
                         fixed_length_turn=fixed_length_turn)

# Identical loader settings for both splits, applied via one shared dict.
_loader_kwargs = dict(batch_size=16,
                      turns=fixed_length_turn,
                      shuffle=False,
                      sort=False,
                      callback=padding)
train_dataloader = DictDataLoader(train_dataset, **_loader_kwargs)
valid_dataloader = DictDataLoader(valid_dataset, **_loader_kwargs)
# ---------------- 示例#5 (Example #5, score 0) ----------------
## ------------------ 01 Exercise the preprocessor ------------------

print("对数据进行预处理...")
preprocessor = CNAlbertPreprocessorForMultiQA(Path(albert_path) / vocab_file,
                                              uttr_len=fixed_length_uttr,
                                              resp_len=fixed_length_resp)
data = preprocessor.transform(data)

# Keep only the columns the model consumes, then mark every pair positive.
_keep_cols = ['D_num', 'turns', 'utterances', 'response',
              'utterances_len', 'response_len']
data = data[_keep_cols]
data['label'] = 1

## ------------------ 02 Wrap the data ------------------
dataset = PairDataset(data, num_neg=0)
padding = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                         fixed_length_resp=fixed_length_resp,
                         fixed_length_turn=fixed_length_turn)
dataloader = DictDataLoader(dataset,
                            batch_size=batch_size,
                            turns=fixed_length_turn,
                            shuffle=False,
                            sort=False,
                            callback=padding)

## ------------- 03 Define the model and run a forward pass -------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前正在测试的模型 {name.upper()}")
print("定义模型和参数...")