Example #1
0
 def prepare_raw_data(self, index):
     """Fetch one bucket of raw waveforms plus padded label sequences.

     Args:
         index: bucket index into ``self.data`` (each bucket is a list of
             ``(file_path, label_sequence)`` pairs).

     Returns:
         ``(x, y)`` — list of 1-D waveform tensors and the padded label batch.
     """
     # Load label: unzip the bucket into parallel path / label lists.
     d = self.data[index]
     x, y = zip(*d)
     # Pad every label sequence to the longest one in this bucket.
     y = target_padding(y, max(len(v) for v in y))
     # Load acoustic feature and pad.
     # BUG FIX: torchaudio.load returns a (waveform, sample_rate) tuple;
     # the original called .squeeze(0) on the tuple itself, which raises
     # AttributeError at runtime. Index [0] to get the waveform first
     # (matches the working variant elsewhere in this file).
     x = [torchaudio.load(os.path.join(self.raw_root, f))[0].squeeze(0) for f in x]
     return x, y
Example #2
0
 def prepare_raw_data(self, index):
     """Return a bucket of raw waveforms and padded labels.

     When ``self.text_only`` is set, only the padded label batch is
     returned and no audio is read from disk.
     """
     # Unzip the bucket into parallel path / label sequences.
     pairs = self.data[index]
     paths, labels = zip(*pairs)
     # Pad labels to the longest sequence in this bucket.
     longest = max(len(seq) for seq in labels)
     labels = target_padding(labels, longest)
     if self.text_only:
         return labels

     # Resolve each entry against its raw-data folder, swap the 4-char
     # extension for .flac, then load and drop the channel dimension.
     waveforms = []
     for rel_path in paths:
         resolved = self.get_raw_data_folder(rel_path)
         full_path = os.path.join(self.raw_root, resolved[:-4] + '.flac')
         waveforms.append(torchaudio.load(full_path)[0].squeeze(0))
     return waveforms, labels
Example #3
0
 def prepare_data(self, index):
     """Return a bucket of precomputed acoustic features and padded labels.

     When ``self.text_only`` is set, only the padded label batch is
     returned and no feature files are read.
     """
     # Unzip the bucket into parallel path / label sequences.
     pairs = self.data[index]
     paths, labels = zip(*pairs)
     # Pad labels to the longest sequence in this bucket.
     labels = target_padding(labels, max(len(seq) for seq in labels))
     if self.text_only:
         return labels

     # Read each saved feature array and pad the batch to a common length.
     feats = [torch.FloatTensor(np.load(os.path.join(self.root, p))) for p in paths]
     feats = pad_sequence(feats, batch_first=True)
     return feats, labels
Example #4
0
    def __init__(self, file_path, raw_file_path, sets, bucket_size, max_timestep=0, max_label_len=0, raw_wav_data=False):
        """Build a length-bucketed ASR dataset from pickled features and CSV tables.

        Args:
            file_path: directory holding ``<set>_x.pkl``, ``<set>_y.pkl`` and
                ``<set>.csv`` for every split in ``sets``.
            raw_file_path: root directory of the raw audio files.
            sets: iterable of split names to concatenate.
            bucket_size: number of utterances per bucket.
            max_timestep: cap on acoustic length per bucket (default 0 —
                see NOTE below).
            max_label_len: cap on label length per bucket (default 0 —
                see NOTE below).
            raw_wav_data: if True, iterate raw waveforms instead of
                precomputed features.
        """
        self.raw_root = raw_file_path
        self.raw_wav_data = raw_wav_data
        # Open dataset: concatenate pickled features/labels across splits.
        x = []
        y = []
        tables = []
        for s in sets:
            with open(os.path.join(file_path, s + '_x.pkl'), 'rb') as fp:
                x += pickle.load(fp)
            with open(os.path.join(file_path, s + '_y.pkl'), 'rb') as fp:
                y += pickle.load(fp)
            # load data path
            tables.append(pd.read_csv(os.path.join(file_path, s + '.csv')))
        assert len(x) == len(y)

        # Sort data w.r.t. length, descending. Hoisted: the original ran
        # np.argsort over the same lengths twice.
        self.X = []
        self.Y = []
        order = list(reversed(np.argsort([len(t) for t in x])))
        sorted_x = [x[idx] for idx in order]
        sorted_y = [y[idx] for idx in order]
        self.table = pd.concat(tables, ignore_index=True).sort_values(by=['length'], ascending=False)

        # Bucketing
        for b in range(int(np.ceil(len(sorted_x) / bucket_size))):
            offset = b * bucket_size
            bound = min((b + 1) * bucket_size, len(sorted_x))
            # NOTE(review): with the default max_timestep=0 / max_label_len=0
            # these min() calls always yield 0 — presumably zero_padding /
            # target_padding treat 0 as "no cap"; confirm before relying on it.
            bucket_max_timestep = min(max_timestep, len(sorted_x[offset]))
            self.X.append(zero_padding(sorted_x[offset:bound], bucket_max_timestep))
            bucket_max_label_len = min(max_label_len, max(len(v) for v in sorted_y[offset:bound]))
            self.Y.append(target_padding(sorted_y[offset:bound], bucket_max_label_len))
        # BUG FIX: the original referenced self.prepara_raw_data (typo); the
        # method defined in this file is prepare_raw_data, so raw_wav_data=True
        # raised AttributeError on the first iteration.
        self.iterator = self.prepare_feature if not raw_wav_data else self.prepare_raw_data

        # Path-driven bucketing variant built from the concatenated CSV table.
        X = self.table['file_path'].tolist()
        X_lens = self.table['length'].tolist()
        Y = [list(map(int, label.split('_'))) for label in self.table['label'].tolist()]

        # Bucketing, X & X_len is dummy when text_only==True
        self.data = bucket_data(X_lens, list(zip(X, Y)), bucket_size)
        # MFCC extractor: 16 kHz audio, 10 ms hop, 25 ms analysis window.
        self.transorm = torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=40, dct_type=2,
                                                   melkwargs={'hop_length': int(16*10), 'n_fft': int(16*25)})
        # Correctly-spelled alias for the misspelled attribute above; kept in
        # addition (not instead) so any existing reader of self.transorm still works.
        self.transform = self.transorm