import os
import random

import numpy as np
import scipy.io.wavfile as wav
# MFCC extraction via the python_speech_features package (its keyword
# arguments match the ones used throughout this file).
from python_speech_features import mfcc


# Variant 1: standalone MFCC preprocessing with explicit feature parameters.
def preprocess_data(num_mfcc_coeffs, num_filters, window_len, window_step, max_num_frames):
    """Processes the training data and returns MFCC matrices for every example.

    Args:
        num_mfcc_coeffs: number of MFCC coefficients to extract per frame
        num_filters: number of mel filterbank channels
        window_len: analysis window length in seconds
        window_step: step between successive windows in seconds
        max_num_frames: length (in frames) to which each MFCC matrix is padded

    Returns:
        inputs: a list of accent-1 padded MFCC matrices, one per training example
        labels: a list of accent-2 padded MFCC matrices, one per training example
    """
    inputs = []
    labels = []

    SOURCE_DIR = '../data/cmu_arctic/us-english-female-slt/wav/'
    TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'

    index = 0
    # Sort the listings so parallel utterances pair up deterministically.
    for source_fname, target_fname in zip(sorted(os.listdir(SOURCE_DIR)),
                                          sorted(os.listdir(TARGET_DIR))):
        # Only process the first 20 file pairs.
        if index >= 20:
            break
        index += 1

        # Skip macOS metadata files.
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue

        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data, samplerate=source_sample_rate,
                                             numcep=num_mfcc_coeffs, nfilt=num_filters,
                                             winlen=window_len, winstep=window_step))
        target_mfcc_features = np.array(mfcc(target_wav_data, samplerate=target_sample_rate,
                                             numcep=num_mfcc_coeffs, nfilt=num_filters,
                                             winlen=window_len, winstep=window_step))

        # Align the two MFCC feature matrices with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pad the MFCC feature matrices (rows) to max_num_frames; the masks are unused here.
        source_padded_frames, _ = pad_sequence(source_mfcc_features, max_num_frames)
        target_padded_frames, _ = pad_sequence(target_mfcc_features, max_num_frames)

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
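# pad_sequence is defined elsewhere in the repo. A minimal sketch consistent
# with how the MFCC variants call it (pad or truncate the rows of a
# (num_frames, num_coeffs) matrix to max_num_frames, and also return a mask
# over the real frames) is given below; the exact behavior and return types
# are assumptions, not the repo's actual implementation. (The FFT variant
# further below calls a different overload that takes num_samples_per_frame.)
def pad_sequence(features, max_num_frames):
    """Pad/truncate a (num_frames, num_coeffs) matrix to max_num_frames rows.

    Returns the padded matrix and a boolean mask marking the real (non-pad) frames.
    """
    num_frames, num_coeffs = features.shape
    padded = np.zeros((max_num_frames, num_coeffs), dtype=features.dtype)
    n = min(num_frames, max_num_frames)
    padded[:n, :] = features[:n, :]
    mask = np.zeros(max_num_frames, dtype=bool)
    mask[:n] = True
    return padded, mask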
# Variant 2: like variant 1, but also builds frame masks and shuffles the examples.
def preprocess_data(num_mfcc_coeffs, max_num_frames):
    """Processes the training data and returns MFCC matrices for every example.

    Args:
        num_mfcc_coeffs: number of MFCC coefficients to extract per frame
        max_num_frames: length (in frames) to which each MFCC matrix is padded

    Returns:
        inputs: accent-1 padded MFCC matrices, one per training example
        input_masks: frame masks for the inputs
        labels: accent-2 padded MFCC matrices, one per training example
        label_masks: frame masks for the labels
    """
    inputs = []
    labels = []
    input_masks = []
    label_masks = []

    #SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    #TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
    SOURCE_DIR = '../data/cmu_arctic/mini_a/'
    TARGET_DIR = '../data/cmu_arctic/mini_b/'

    for source_fname, target_fname in zip(sorted(os.listdir(SOURCE_DIR)),
                                          sorted(os.listdir(TARGET_DIR))):
        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data, samplerate=source_sample_rate,
                                             numcep=num_mfcc_coeffs))
        target_mfcc_features = np.array(mfcc(target_wav_data, samplerate=target_sample_rate,
                                             numcep=num_mfcc_coeffs))

        # Align the MFCC feature matrices with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pad the MFCC feature matrices (rows) to length max_num_frames.
        source_padded_frames, source_mask = pad_sequence(source_mfcc_features, max_num_frames)
        target_padded_frames, target_mask = pad_sequence(target_mfcc_features, max_num_frames)

        inputs.append(source_padded_frames)
        input_masks.append(source_mask)
        labels.append(target_padded_frames)
        label_masks.append(target_mask)

    # Shuffle all four lists with the same permutation so they stay aligned.
    # (range objects cannot be shuffled in place, so materialize a list first.)
    randomized_indices = list(range(len(inputs)))
    random.shuffle(randomized_indices)
    inputs = [inputs[i] for i in randomized_indices]
    input_masks = [input_masks[i] for i in randomized_indices]
    labels = [labels[i] for i in randomized_indices]
    label_masks = [label_masks[i] for i in randomized_indices]

    # Return the masks alongside the data so callers can ignore padded frames.
    return inputs, input_masks, labels, label_masks
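# get_dtw_series is also defined elsewhere in the repo. A minimal sketch built
# on the fastdtw package is below: fastdtw returns a warping path of (i, j)
# index pairs, and indexing both sequences along that path yields two aligned
# series of equal length. This is an illustrative assumption about the helper,
# not the repo's actual code.
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

def get_dtw_series(source_features, target_features):
    """Warp two (num_frames, num_coeffs) series onto a common aligned length."""
    _, path = fastdtw(source_features, target_features, dist=euclidean)
    source_aligned = np.array([source_features[i] for i, _ in path])
    target_aligned = np.array([target_features[j] for _, j in path])
    return source_aligned, target_aligned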
# Variant 3: method form that takes the source/target directories as arguments
# and reads the feature parameters from self.config.
def process_data(self, source_input_dir, target_input_dir):
    """Extracts, aligns, and pads MFCC features for every parallel wav pair."""
    inputs = []
    labels = []

    index = 0
    for source_fname, target_fname in zip(sorted(os.listdir(source_input_dir)),
                                          sorted(os.listdir(target_input_dir))):
        # Uncomment to cap the number of processed file pairs while debugging.
        #if index >= 20:
        #    break
        #index += 1

        # Skip macOS metadata files.
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue

        (source_sample_rate, source_wav_data) = wav.read(source_input_dir + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(target_input_dir + target_fname)

        # appendEnergy is False because we want to keep the 0th coefficient.
        source_mfcc_features = np.array(mfcc(source_wav_data, samplerate=source_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step,
                                             appendEnergy=False))
        target_mfcc_features = np.array(mfcc(target_wav_data, samplerate=target_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step,
                                             appendEnergy=False))

        # Align the MFCC feature matrices with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pad the MFCC feature matrices (rows) to length config.max_num_frames.
        source_padded_frames, _ = pad_sequence(source_mfcc_features, self.config.max_num_frames)
        target_padded_frames, _ = pad_sequence(target_mfcc_features, self.config.max_num_frames)

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
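# Hypothetical call site for the method above; the class name is a placeholder,
# not a name taken from the repo, and the directories follow the ones used
# elsewhere in this file:
#
#   preprocessor = AccentConverter(config)
#   train_inputs, train_labels = preprocessor.process_data(
#       '../data/cmu_arctic/us-english-male-bdl/wav/',
#       '../data/cmu_arctic/scottish-english-male-awb/wav/')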
from scipy.fftpack import fft


# Variant 4: FFT-based preprocessing instead of MFCCs.
def preprocess_data(self, config):
    """Processes the training data and returns FFT frames for every example.

    Args:
        config: the Config object with various parameters specified

    Returns:
        inputs: a list, one per training example, of accent-1 padded FFT data
        labels: a list, one per training example, of accent-2 padded FFT data
    """
    inputs = []
    labels = []

    #SOURCE_DIR = '../data/cmu_arctic/us-english-female-slt/wav/'
    SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
    #TARGET_DIR = '../data/cmu_arctic/indian-english-male-ksp/wav/'

    index = 0
    for source_fname, target_fname in zip(sorted(os.listdir(SOURCE_DIR)),
                                          sorted(os.listdir(TARGET_DIR))):
        # Only process the first 5 file pairs.
        if index >= 5:
            break
        index += 1

        # Skip macOS metadata files.
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue

        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        src_fft = fft(source_wav_data)  # both of these are complex-valued
        tgt_fft = fft(target_wav_data)

        # Pad the FFT data, then reshape it into fixed-size frames.
        source_padded_frames = pad_sequence(src_fft, config.max_num_frames,
                                            num_samples_per_frame=self.config.num_samples_per_frame)
        target_padded_frames = pad_sequence(tgt_fft, config.max_num_frames,
                                            num_samples_per_frame=self.config.num_samples_per_frame)

        source_padded_frames = np.reshape(source_padded_frames,
                                          (self.config.max_num_frames,
                                           self.config.num_samples_per_frame))
        target_padded_frames = np.reshape(target_padded_frames,
                                          (self.config.max_num_frames,
                                           self.config.num_samples_per_frame))

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
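# The FFT frames above are complex-valued, while most models operate on real
# tensors. One common option (an assumption here, not necessarily what this
# repo does) is to stack the real and imaginary parts as separate channels:
def complex_to_real_channels(padded_frames):
    """(max_num_frames, n) complex -> (max_num_frames, 2 * n) real."""
    return np.concatenate([np.real(padded_frames), np.imag(padded_frames)], axis=1)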
# Variant 5: denoising setup that also produces a corrupted copy of each input.
def preprocess_data(self, config):
    """Processes the training data and returns MFCC matrices for every example.

    Args:
        config: the Config object with various parameters specified

    Returns:
        inputs: accent-1 padded MFCC matrices, one per training example
        inputs_corrupted: the corrupted copies of the inputs
        input_masks: frame masks for the inputs
        labels: accent-2 padded MFCC matrices, one per training example
        label_masks: frame masks for the labels
    """
    inputs = []
    inputs_corrupted = []
    labels = []
    input_masks = []
    label_masks = []

    SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'

    for source_fname, target_fname in zip(sorted(os.listdir(SOURCE_DIR)),
                                          sorted(os.listdir(TARGET_DIR))):
        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data, samplerate=source_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs))
        target_mfcc_features = np.array(mfcc(target_wav_data, samplerate=target_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs))

        # Align the MFCC feature matrices with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Corrupt the input (source) MFCC features.
        source_mfcc_features_corrupted = corrupt_input(source_mfcc_features,
                                                       corr_frac=self.config.corr_frac,
                                                       corr_type=self.config.corr_type)

        # Pad the MFCC feature matrices (rows) to length config.max_num_frames.
        source_padded_frames, source_mask = pad_sequence(source_mfcc_features,
                                                         config.max_num_frames)
        source_padded_frames_corrupted, _ = pad_sequence(source_mfcc_features_corrupted,
                                                         config.max_num_frames)
        target_padded_frames, target_mask = pad_sequence(target_mfcc_features,
                                                         config.max_num_frames)

        inputs.append(source_padded_frames)
        inputs_corrupted.append(source_padded_frames_corrupted)
        # Assumed completion: the original excerpt is truncated at this point;
        # the remaining appends and the return follow the pattern of the other
        # variants and the docstring above.
        input_masks.append(source_mask)
        labels.append(target_padded_frames)
        label_masks.append(target_mask)

    return inputs, inputs_corrupted, input_masks, labels, label_masks
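# corrupt_input is not shown in this excerpt. A minimal sketch in the style of
# a denoising autoencoder is below: corr_frac is the fraction of entries to
# corrupt, and corr_type selects the noise model. Both the supported corr_type
# values and the noise parameters are assumptions, not the repo's actual code.
def corrupt_input(features, corr_frac=0.1, corr_type='masking'):
    """Return a copy of features with a random fraction of entries corrupted."""
    corrupted = features.copy()
    num_corrupt = int(round(corr_frac * features.size))
    flat_indices = np.random.choice(features.size, num_corrupt, replace=False)
    rows, cols = np.unravel_index(flat_indices, features.shape)
    if corr_type == 'masking':
        corrupted[rows, cols] = 0.0  # masking noise: zero out chosen entries
    elif corr_type == 'gaussian':
        corrupted[rows, cols] += np.random.normal(0.0, 0.1, size=num_corrupt)
    return corrupted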
# Variant 6: speaker-classification preprocessing over three ARCTIC voices.
def preprocess_data(self, config, src=None):
    """Processes the training data and returns MFCC matrices for every utterance.

    Args:
        config: the Config object with various parameters specified
        src: unused; kept for interface compatibility

    Returns:
        inputs: padded, mean-normalized MFCC matrices, one per utterance
        labels: the integer speaker index for each utterance
            (0 = awb, 1 = bdl, 2 = ksp)
    """
    #<TODO> add functionality to store mfccs rather than having to extract them every single time
    inputs = []
    labels = []
    name_dict = {0: 'awb', 1: 'bdl', 2: 'ksp'}

    for j in range(len(name_dict)):
        # The ARCTIC prompts come in an 'a' set and a 'b' set of utterances.
        for prefix, utterance_ids in (('a', range(51, 594)), ('b', range(1, 540))):
            for i in utterance_ids:
                # Utterance numbers are zero-padded to four digits, e.g. arctic_a0051.wav.
                fname = 'cmu_us_%s_arctic/wav/arctic_%s%04d.wav' % (name_dict[j], prefix, i)
                (source_sample_rate, source_wav_data) = wav.read(fname)

                source_mfcc = np.array(mfcc(source_wav_data, samplerate=source_sample_rate,
                                            numcep=self.config.num_mfcc_coeffs,
                                            nfilt=self.config.num_filters,
                                            winlen=self.config.window_len,
                                            winstep=self.config.window_step))
                # Normalize each coefficient by its mean over the utterance.
                source_mfcc = source_mfcc / np.mean(source_mfcc, axis=0)

                source_mfcc_padded, _ = pad_sequence(source_mfcc, self.config.max_num_frames)
                inputs.append(source_mfcc_padded)
                labels.append(j)

    # Shuffle inputs and labels with the same permutation so they stay paired.
    paired = list(zip(inputs, labels))
    random.shuffle(paired)
    inputs = [example for example, _ in paired]
    labels = [label for _, label in paired]

    return inputs, labels
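# One way to address the TODO above (caching MFCCs instead of re-extracting
# them on every run) is an np.save/np.load cache keyed by the wav filename.
# This helper is a hypothetical sketch, not part of the repo:
def load_or_extract_mfcc(wav_path, cache_dir, extract_fn):
    """Return cached MFCCs for wav_path, computing them with extract_fn on a miss.

    extract_fn is a zero-argument callable that returns the MFCC matrix.
    """
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, os.path.basename(wav_path) + '.npy')
    if os.path.exists(cache_path):
        return np.load(cache_path)
    features = extract_fn()
    np.save(cache_path, features)
    return features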
# Variant 7: like variant 2, but in method form; returns the masks alongside the data.
def preprocess_data(self, config):
    """Processes the training data and returns MFCC matrices for every example.

    Args:
        config: the Config object with various parameters specified

    Returns:
        inputs: accent-1 padded MFCC matrices, one per training example
        input_masks: frame masks for the inputs
        labels: accent-2 padded MFCC matrices, one per training example
        label_masks: frame masks for the labels
    """
    inputs = []
    labels = []
    input_masks = []
    label_masks = []

    SOURCE_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
    TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    #TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'

    index = 0
    for source_fname, target_fname in zip(sorted(os.listdir(SOURCE_DIR)),
                                          sorted(os.listdir(TARGET_DIR))):
        # Only process the first 20 file pairs.
        if index >= 20:
            break
        index += 1

        # Skip macOS metadata files.
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue

        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data, samplerate=source_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step))
        target_mfcc_features = np.array(mfcc(target_wav_data, samplerate=target_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step))

        # Align the MFCC feature matrices with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pad the MFCC feature matrices (rows) to length config.max_num_frames.
        source_padded_frames, source_mask = pad_sequence(source_mfcc_features,
                                                         config.max_num_frames)
        target_padded_frames, target_mask = pad_sequence(target_mfcc_features,
                                                         config.max_num_frames)

        # Debugging hooks for listening to the aligned pairs:
        #if index < 20:
        #    self.output_wave_file(source_padded_frames, filename='src' + str(index))
        #    self.output_wave_file(target_padded_frames, filename='tgt' + str(index))
        #wav.write('source' + str(index) + '.wav', self.config.sample_rate, source_wav_data)
        #wav.write('target' + str(index) + '.wav', self.config.sample_rate, target_wav_data)
        #self.eng.soundsc(matlab.double(source_wav_data.tolist()), self.config.sample_rate, nargout=0)
        #self.eng.soundsc(matlab.double(target_wav_data.tolist()), self.config.sample_rate, nargout=0)

        inputs.append(source_padded_frames)
        input_masks.append(source_mask)
        labels.append(target_padded_frames)
        label_masks.append(target_mask)

    return inputs, input_masks, labels, label_masks
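# Hypothetical call site for the variant above; the model class name is a
# placeholder, not a name taken from the repo:
#
#   model = AccentConversionModel(config)
#   inputs, input_masks, labels, label_masks = model.preprocess_data(config)
#   print('loaded %d aligned utterance pairs' % len(inputs))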