示例#1
0
 def prepare_dataset(self):
     """Create ``self.test_loader`` from the ``test_df2`` frame of ``PreProcess``.

     A fresh ``PreProcess`` object is built and its ``prepare_dataset()`` is
     run before ``test_df2`` is read. The frame is wrapped in a
     ``SentimentDataset`` in ``'test'`` mode (``max_length=100``) and served
     one sample per batch.
     """
     preprocessor = PreProcess()
     preprocessor.prepare_dataset()

     eval_dataset = SentimentDataset(preprocessor.test_df2,
                                     max_length=100,
                                     mode='test')

     # NOTE(review): the test loader is shuffled; presumably so that random
     # samples are inspected first -- confirm this is intended.
     loader_options = {'batch_size': 1, 'num_workers': 0, 'shuffle': True}
     self.test_loader = DataLoader(eval_dataset, **loader_options)
 def prepare_dataset(self):
     """Create the train, validation and test loaders on ``self``.

     A fresh ``PreProcess`` object is built and its ``prepare_dataset()`` is
     run; its ``train_df``, ``val_df`` and ``test_df1`` frames are then each
     wrapped in a ``SentimentDataset`` (``max_length=100``; the test split
     uses ``mode='test'``) and exposed as ``self.train_loader``,
     ``self.val_loader`` and ``self.test_loader``.
     """
     preprocessor = PreProcess()
     preprocessor.prepare_dataset()
     train_frame, val_frame, test_frame = (preprocessor.train_df,
                                           preprocessor.val_df,
                                           preprocessor.test_df1)

     seq_len = 100
     train_set = SentimentDataset(train_frame, max_length=seq_len)
     val_set = SentimentDataset(val_frame, max_length=seq_len)
     test_set = SentimentDataset(test_frame, max_length=seq_len, mode='test')

     # All three loaders share the same settings.
     # NOTE(review): validation and test data are shuffled as well -- confirm
     # this is intended.
     loader_options = {'batch_size': 32, 'num_workers': 0, 'shuffle': True}
     self.train_loader = DataLoader(train_set, **loader_options)
     self.val_loader = DataLoader(val_set, **loader_options)
     self.test_loader = DataLoader(test_set, **loader_options)
    def __getitem__(self, index):
        """Return the tokenized sample stored at row ``index`` of ``self.df``.

        The text comes from the ``'pre_process'`` column and the raw label
        from ``row[0]``. Any non-zero label is collapsed to ``1`` so the
        target is binary.

        Args:
            index: Integer row position handed to ``self.df.iloc``.

        Returns:
            ``[(input_ids, attention_mask), label]`` when ``self.mode`` is not
            ``'test'``, otherwise ``[text, (input_ids, attention_mask), label]``.
            ``label`` is always the builtin ``int`` 0 or 1. The two tensors
            are whatever ``self.tokenizer.encode_plus`` returns for
            ``return_tensors='pt'`` (assumed to be a Hugging Face style
            tokenizer, i.e. shape ``(1, self.max_length)`` -- TODO confirm).
        """
        row = self.df.iloc[index]
        text, raw_label = row['pre_process'], row[0]
        # Always return a builtin int so that zero and non-zero labels have
        # the same type when samples are collated into a batch.
        label = int(raw_label != 0)

        # Let the tokenizer pad AND truncate to self.max_length. Padding to a
        # fixed 200 and slicing afterwards produced ragged widths whenever
        # self.max_length exceeded 200, and slicing over-long inputs cut off
        # the tokenizer's trailing special token.
        encoded = self.tokenizer.encode_plus(text=text,
                                             padding='max_length',
                                             truncation=True,
                                             max_length=self.max_length,
                                             return_tensors='pt')
        features = (encoded['input_ids'], encoded['attention_mask'])

        if self.mode != 'test':
            return [features, label]
        # Test mode additionally returns the raw text as the first element.
        return [text, features, label]

    def __len__(self):
        """Return the number of samples this dataset exposes.

        The dataset is capped at 2000 samples but never reports more rows
        than ``self.df`` holds, so every index below the returned length is a
        valid position for ``self.df.iloc``.
        """
        sample_cap = 2000
        # Clamp to the real row count: a fixed 2000 made smaller frames fail
        # with an out-of-bounds ``iloc`` lookup.
        return min(int(self.df.shape[0]), sample_cap)

if __name__ == '__main__':
    # Manual smoke test: build the training dataset and print, for every
    # batch, its index followed by the first element of the batch's first
    # entry.
    from pre_process import PreProcess

    preprocessor = PreProcess()
    preprocessor.prepare_dataset()

    dataset = SentimentDataset(preprocessor.train_df, 200)
    train_loader = DataLoader(dataset, batch_size=5, num_workers=8)

    for step, batch in enumerate(train_loader):
        print(step)
        print(batch[0][0])