def test_load_audio(self):
    """Loading a batch of five indexes yields five audio clips and five transcripts."""
    batch_indexes = np.arange(5)
    x_data_raw, y_data_raw, sr = load_audio(self.df, indexes_in_batch=batch_indexes)
    # One raw audio sequence and one transcript per requested index.
    self.assertEqual(len(x_data_raw), 5)
    self.assertEqual(len(y_data_raw), 5)
def test_extract_features_and_pad(self):
    """Features for five clips are padded to shape (5, 382, 26) and lengths are reported.

    Also verifies that no reported sequence length exceeds the padded
    frame count of 382.
    """
    indexes = np.arange(5)
    x_data_raw, y_data_raw, sr = load_audio(self.df, indexes_in_batch=indexes)
    x_data, input_length = self.dg.extract_features_and_pad(x_data_raw, sr)
    self.assertEqual(x_data.shape, (5, 382, 26))
    self.assertEqual(len(input_length), 5)
    # BUG FIX: the original `assertLessEqual(all(input_length), 382)`
    # compared a boolean (0 or 1) against 382 and therefore always passed.
    # Check each sequence length against the pad length instead.
    self.assertTrue(all(length <= 382 for length in input_length))
def test_convert_transcripts(self):
    """The first transcript converts to the expected 30-value integer encoding."""
    _, y_data_raw, sr = load_audio(self.df, indexes_in_batch=[0])
    transcript, y_length = convert_and_pad_transcripts(y_data_raw)
    # Expected character encoding of the first transcript
    # (presumably 0 = space, 1-26 = a-z — confirm against the converter).
    exp = [
        23., 5., 18., 5., 0., 9., 0., 2., 21., 20., 0., 1., 12., 18., 5.,
        1., 4., 25., 0., 15., 14., 0., 20., 8., 5., 0., 3., 1., 18., 20.
    ]
    # Renamed from `list`, which shadowed the builtin of the same name.
    converted = transcript[0].tolist()
    self.assertListEqual(converted, exp)
    self.assertEqual(y_length, 30)
def test_extract_mel_spec(self):
    """Mel spectrogram is padded to (500, 40) and reports 256 unpadded frames."""
    x_data_raw, _, sr = load_audio(self.df, indexes_in_batch=[0])
    mel_spec, x_length = extract_mel_spectrogram_and_pad(
        x_data_raw[0],
        sr=sr,
        max_pad_length=500,
        frame_length=320,
        hop_length=160,
        n_mels=40)
    self.assertTupleEqual(mel_spec.shape, (500, 40))
    self.assertEqual(x_length, 256)
def test_extract_mfcc(self):
    """MFCC features are padded to (500, 26) and report 256 unpadded frames."""
    x_data_raw, _, sr = load_audio(self.df, indexes_in_batch=[0])
    padded_mfcc, x_length = extract_mfcc_and_pad(
        x_data_raw[0],
        sr=sr,
        max_pad_length=500,
        frame_length=320,
        hop_length=160,
        mfcc_features=26,
        n_mels=40)
    self.assertTupleEqual(padded_mfcc.shape, (500, 26))
    self.assertEqual(x_length, 256)
def __getitem__(self, batch_index):
    """
    Generates a batch of correctly shaped X and Y data

    :param batch_index: index of the batch to generate
    :return: input dictionary containing:
                'the_input': np.ndarray[shape=(batch_size, max_seq_length, mfcc_features)]: input audio data
                'the_labels': np.ndarray[shape=(batch_size, max_transcript_length)]: transcription data
                'input_length': np.ndarray[shape=(batch_size, 1)]: length of each sequence (numb of frames) in x_data
                'label_length': np.ndarray[shape=(batch_size, 1)]: length of each sequence (numb of letters) in y_data
             output dictionary containing:
                'ctc': np.ndarray[shape=(batch_size, 1)]: dummy data for dummy loss function
    """
    # Generate indexes of current batch
    indexes_in_batch = self.indexes[batch_index * self.batch_size:
                                    (batch_index + 1) * self.batch_size]

    # Shuffle indexes within current batch if shuffle=true
    # (shuf is assumed to shuffle in place — TODO confirm it aliases
    # random.shuffle or equivalent)
    if self.shuffle:
        shuf(indexes_in_batch)

    # Load audio and transcripts
    x_data_raw, y_data_raw, sr = load_audio(self.df, indexes_in_batch)

    # Preprocess and pad data
    x_data, input_length = self.extract_features_and_pad(x_data_raw, sr)
    y_data, label_length = convert_and_pad_transcripts(y_data_raw)

    # NOTE: removed six commented-out debug print statements that
    # previously dumped the shapes/values of the arrays above.
    inputs = {
        'the_input': x_data,
        'the_labels': y_data,
        'input_length': input_length,
        'label_length': label_length,
    }
    # Dummy target: the CTC loss is computed inside the model, so the
    # "labels" fed to Keras' fit loop are just zeros.
    outputs = {'ctc': np.zeros([self.batch_size])}

    return inputs, outputs
def test_get_seq_size(self):
    """The first audio clip is measured as 256 frames long."""
    x_data_raw, _, sr = load_audio(self.df, indexes_in_batch=[0])
    seq_size = self.dg.get_seq_size(x_data_raw[0], sr)
    self.assertEqual(seq_size, 256)