Example #1
import os

import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc


def preprocess_data(num_mfcc_coeffs, num_filters, window_len, window_step,
                    max_num_frames):
    """Processes the training data and returns MFCC vectors for all of them.
    Args:
        config: the Config object with various parameters specified
    Returns:
        train_data: A list of tuples, one for each training example: (accent 1 padded MFCC frames, accent 1 mask)
        train_labels: A list of tuples, one for each training example: (accent 2 padded MFCC frames, accent 2 mask)
    """
    inputs = []
    labels = []

    SOURCE_DIR = '../data/cmu_arctic/us-english-female-slt/wav/'
    TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    index = 0
    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR),
                                          os.listdir(TARGET_DIR)):
        if index >= 20:
            break
        index += 1

        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue

        (source_sample_rate,
         source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate,
         target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(
            mfcc(source_wav_data,
                 samplerate=source_sample_rate,
                 numcep=num_mfcc_coeffs,
                 nfilt=num_filters,
                 winlen=window_len,
                 winstep=window_step))
        target_mfcc_features = np.array(
            mfcc(target_wav_data,
                 samplerate=target_sample_rate,
                 numcep=num_mfcc_coeffs,
                 nfilt=num_filters,
                 winlen=window_len,
                 winstep=window_step))

        # Aligns the MFCC feature matrices with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pads the MFCC feature matrices (rows) to max_num_frames.
        source_padded_frames = pad_sequence(source_mfcc_features,
                                            max_num_frames)
        target_padded_frames = pad_sequence(target_mfcc_features,
                                            max_num_frames)

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
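
Neither get_dtw_series nor pad_sequence is defined on this page; they are project-local helpers. A minimal sketch of get_dtw_series, assuming the fastdtw package and per-frame Euclidean distance (the helper's exact behavior is an assumption):

import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

def get_dtw_series(source_features, target_features):
    """Time-aligns two MFCC sequences along the FastDTW warping path.

    Both returned arrays have the same number of rows, with corresponding
    rows matched in time.
    """
    _, path = fastdtw(source_features, target_features, dist=euclidean)
    aligned_source = np.array([source_features[i] for i, j in path])
    aligned_target = np.array([target_features[j] for i, j in path])
    return aligned_source, aligned_target
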
Example #2
def preprocess_data(num_mfcc_coeffs, max_num_frames):
    """Processes the training data and returns MFCC vectors for all of them.
    Args:
        num_mfcc_coeffs, max_num_frames
    Returns:
        train_data: A list of tuples, one for each training example: (accent 1 padded MFCC frames, accent 1 mask)
        train_labels: A list of tuples, one for each training example: (accent 2 padded MFCC frames, accent 2 mask)
    """
    inputs = []
    labels = []
    input_masks = []
    label_masks = []

    #SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    #TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
    SOURCE_DIR = '../data/cmu_arctic/mini_a/'
    TARGET_DIR = '../data/cmu_arctic/mini_b/'
    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR),
                                          os.listdir(TARGET_DIR)):
        (source_sample_rate,
         source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate,
         target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(
            mfcc(source_wav_data,
                 samplerate=source_sample_rate,
                 numcep=num_mfcc_coeffs))
        target_mfcc_features = np.array(
            mfcc(target_wav_data,
                 samplerate=target_sample_rate,
                 numcep=num_mfcc_coeffs))

        # Aligns the MFCC feature matrices using FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pads the MFCC feature matrices (rows) to max_num_frames.
        source_padded_frames, source_mask = pad_sequence(
            source_mfcc_features, max_num_frames)
        target_padded_frames, target_mask = pad_sequence(
            target_mfcc_features, max_num_frames)

        inputs.append(source_padded_frames)
        input_masks.append(source_mask)
        labels.append(target_padded_frames)
        label_masks.append(target_mask)

    # Shuffle example order while keeping inputs, labels, and masks aligned.
    # list() is needed because range objects can't be shuffled in place in Python 3.
    randomized_indices = list(range(len(inputs)))
    random.shuffle(randomized_indices)
    inputs = [inputs[i] for i in randomized_indices]
    input_masks = [input_masks[i] for i in randomized_indices]
    labels = [labels[i] for i in randomized_indices]
    label_masks = [label_masks[i] for i in randomized_indices]

    return inputs, input_masks, labels, label_masks
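
pad_sequence is the other assumed helper; in this variant it returns both the padded matrix and a mask marking which frames are real. A sketch consistent with how it is called above (the name and semantics are assumptions):

import numpy as np

def pad_sequence(features, max_num_frames):
    """Zero-pads (or truncates) a feature matrix to max_num_frames rows.

    Returns the padded matrix plus a boolean mask that is True for real
    frames and False for padding.
    """
    num_frames, num_coeffs = features.shape
    padded = np.zeros((max_num_frames, num_coeffs), dtype=features.dtype)
    mask = np.zeros(max_num_frames, dtype=bool)
    n = min(num_frames, max_num_frames)
    padded[:n] = features[:n]
    mask[:n] = True
    return padded, mask
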
Example #3
    def process_data(self, source_input_dir, target_input_dir):
        """Extracts aligned, padded MFCC feature matrices from paired wav directories."""
        inputs = []
        labels = []

        index = 0
        for source_fname, target_fname in zip(os.listdir(source_input_dir),
                                              os.listdir(target_input_dir)):
            # Uncomment to cap the number of files processed:
            #if index >= 20:
            #    break
            #index += 1

            if source_fname == '.DS_Store' or target_fname == '.DS_Store':
                continue

            (source_sample_rate,
             source_wav_data) = wav.read(source_input_dir + source_fname)
            (target_sample_rate,
             target_wav_data) = wav.read(target_input_dir + target_fname)

            # appendEnergy is False because we want to keep the 0th coefficient
            source_mfcc_features = np.array(
                mfcc(source_wav_data,
                     samplerate=source_sample_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step,
                     appendEnergy=False))
            target_mfcc_features = np.array(
                mfcc(target_wav_data,
                     samplerate=target_sample_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step,
                     appendEnergy=False))

            # Aligns the MFCC feature matrices using FastDTW.
            source_mfcc_features, target_mfcc_features = get_dtw_series(
                source_mfcc_features, target_mfcc_features)

            # Pads the MFCC feature matrices (rows) to length config.max_num_frames
            source_padded_frames, _ = pad_sequence(source_mfcc_features,
                                                   self.config.max_num_frames)
            target_padded_frames, _ = pad_sequence(target_mfcc_features,
                                                   self.config.max_num_frames)

            inputs.append(source_padded_frames)
            labels.append(target_padded_frames)

        return inputs, labels
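
The appendEnergy flag matters here: with the python_speech_features default (True), the 0th cepstral coefficient is replaced by the log of the total frame energy; setting it to False keeps the raw 0th coefficient, presumably so the features can later be inverted back to audio. A quick self-contained check of the difference:

import numpy as np
from python_speech_features import mfcc

signal = np.random.randn(16000)  # hypothetical one second of 16 kHz audio
feat_energy = mfcc(signal, samplerate=16000, numcep=13, appendEnergy=True)
feat_raw = mfcc(signal, samplerate=16000, numcep=13, appendEnergy=False)

# Only the 0th column differs between the two settings.
assert np.allclose(feat_energy[:, 1:], feat_raw[:, 1:])
assert not np.allclose(feat_energy[:, 0], feat_raw[:, 0])
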
Example #4
    def preprocess_data(self, config):
        """Processes the training data and returns padded FFT frames for all of them.
        Args:
            config: the Config object with various parameters specified
        Returns:
            inputs: a list, one for each training example: accent 1 padded FFT data
            labels: a list, one for each training example: accent 2 padded FFT data
        """
        inputs = []
        labels = []

        #SOURCE_DIR = '../data/cmu_arctic/us-english-female-slt/wav/'
        SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
        TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
        #TARGET_DIR = '../data/cmu_arctic/indian-english-male-ksp/wav/'
        index = 0
        for source_fname, target_fname in zip(os.listdir(SOURCE_DIR),
                                              os.listdir(TARGET_DIR)):
            if index >= 5:
                break
            index += 1
            if source_fname == '.DS_Store' or target_fname == '.DS_Store':
                continue

            (source_sample_rate,
             source_wav_data) = wav.read(SOURCE_DIR + source_fname)
            (target_sample_rate,
             target_wav_data) = wav.read(TARGET_DIR + target_fname)

            # Both FFTs are complex-valued arrays.
            src_fft = fft(source_wav_data)
            tgt_fft = fft(target_wav_data)

            source_padded_frames = pad_sequence(
                src_fft, self.config.max_num_frames,
                num_samples_per_frame=self.config.num_samples_per_frame)
            target_padded_frames = pad_sequence(
                tgt_fft, self.config.max_num_frames,
                num_samples_per_frame=self.config.num_samples_per_frame)

            source_padded_frames = np.reshape(
                source_padded_frames,
                (self.config.max_num_frames, self.config.num_samples_per_frame))
            target_padded_frames = np.reshape(
                target_padded_frames,
                (self.config.max_num_frames, self.config.num_samples_per_frame))

            inputs.append(source_padded_frames)
            labels.append(target_padded_frames)

        return inputs, labels
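
Unlike the MFCC variants, this version feeds the model raw FFT coefficients, which are complex-valued; the upside is that the transform is exactly invertible. A quick round-trip check (scipy.fftpack is assumed as the source of fft):

import numpy as np
from scipy.fftpack import fft, ifft

signal = np.random.randn(16000)      # hypothetical one second of audio
spectrum = fft(signal)               # complex-valued
recovered = np.real(ifft(spectrum))  # imaginary part is numerical noise
assert np.allclose(signal, recovered)
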
Example #5
    def preprocess_data(self, config):
        """Processes the training data and returns MFCC vectors for all of them.
        Args:
            config: the Config object with various parameters specified
        Returns:
            inputs, inputs_corrupted, input_masks: clean and corrupted padded MFCC frames plus frame masks for accent 1, one per training example
            labels, label_masks: padded MFCC frames and frame masks for accent 2, one per training example
        """
        inputs = []
        inputs_corrupted = []
        labels = []
        input_masks = []
        label_masks = []

        SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
        TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
        for source_fname, target_fname in zip(os.listdir(SOURCE_DIR),
                                              os.listdir(TARGET_DIR)):
            (source_sample_rate,
             source_wav_data) = wav.read(SOURCE_DIR + source_fname)
            (target_sample_rate,
             target_wav_data) = wav.read(TARGET_DIR + target_fname)

            source_mfcc_features = np.array(
                mfcc(source_wav_data,
                     samplerate=source_sample_rate,
                     numcep=self.config.num_mfcc_coeffs))
            target_mfcc_features = np.array(
                mfcc(target_wav_data,
                     samplerate=target_sample_rate,
                     numcep=self.config.num_mfcc_coeffs))

            # Aligns the MFCC feature matrices using FastDTW.
            source_mfcc_features, target_mfcc_features = get_dtw_series(
                source_mfcc_features, target_mfcc_features)

            # Corrupts the input (source) MFCC features, denoising-autoencoder style.
            source_mfcc_features_corrupted = corrupt_input(
                source_mfcc_features,
                corr_frac=self.config.corr_frac,
                corr_type=self.config.corr_type)

            # Pads the MFCC feature matrices (rows) to length config.max_num_frames.
            source_padded_frames, source_mask = pad_sequence(
                source_mfcc_features, config.max_num_frames)
            source_padded_frames_corrupted, _ = pad_sequence(
                source_mfcc_features_corrupted, config.max_num_frames)
            target_padded_frames, target_mask = pad_sequence(
                target_mfcc_features, config.max_num_frames)

            inputs.append(source_padded_frames)
            inputs_corrupted.append(source_padded_frames_corrupted)
            input_masks.append(source_mask)
            labels.append(target_padded_frames)
            label_masks.append(target_mask)

        return inputs, inputs_corrupted, input_masks, labels, label_masks
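
corrupt_input is another helper that is not shown on this page; it injects denoising-autoencoder-style noise into the source features. A plausible sketch, assuming 'masking' and 'gaussian' corruption types (the supported corr_type values are an assumption):

import numpy as np

def corrupt_input(features, corr_frac=0.1, corr_type='masking'):
    """Corrupts a random fraction corr_frac of the feature entries."""
    corrupted = features.copy()
    chosen = np.random.rand(*features.shape) < corr_frac
    if corr_type == 'masking':
        corrupted[chosen] = 0.0          # zero out the chosen entries
    elif corr_type == 'gaussian':
        corrupted[chosen] += np.random.normal(0.0, 0.1, size=chosen.sum())
    return corrupted
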
Example #6
    def preprocess_data(self, config, src=None):
        """Processes the training data and returns MFCC vectors for all of them.
        Args:
            config: the Config object with various parameters specified
        Returns:
            inputs: a list of padded MFCC feature matrices, one per utterance
            labels: a list of speaker indices (0: 'awb', 1: 'bdl', 2: 'ksp'), one per utterance
        """
        # TODO: cache the extracted MFCCs instead of recomputing them on every run.
        inputs = []
        labels = []
        name_dict = {0: 'awb', 1: 'bdl', 2: 'ksp'}
        for j in range(len(name_dict)):
            # Each CMU ARCTIC speaker has 'a' and 'b' utterance lists; filenames
            # are zero-padded to four digits (e.g. arctic_a0051.wav).
            for prefix, indices in (('a', range(51, 594)), ('b', range(1, 540))):
                for i in indices:
                    fname = ('cmu_us_' + name_dict[j] + '_arctic/wav/arctic_' +
                             prefix + str(i).zfill(4) + '.wav')
                    (source_sample_rate, source_wav_data) = wav.read(fname)
                    source_mfcc = np.array(
                        mfcc(source_wav_data,
                             samplerate=source_sample_rate,
                             numcep=self.config.num_mfcc_coeffs,
                             nfilt=self.config.num_filters,
                             winlen=self.config.window_len,
                             winstep=self.config.window_step))
                    # Normalizes each coefficient by its mean over all frames.
                    source_mfcc = source_mfcc / np.mean(source_mfcc, axis=0)
                    source_mfcc_padded, _ = pad_sequence(
                        source_mfcc, self.config.max_num_frames)
                    inputs.append(source_mfcc_padded)
                    labels.append(j)

        # Shuffles the (input, label) pairs together so they stay aligned.
        full = list(zip(inputs, labels))
        random.shuffle(full)
        inputs = [input_ for input_, _ in full]
        labels = [label for _, label in full]
        return inputs, labels
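
One caveat with the normalization above: dividing by the per-coefficient mean blows up when a coefficient's mean is near zero (and flips sign when it is negative), both of which can happen with cepstral features. A more conventional alternative, if that becomes a problem, is mean/variance normalization (normalize_mfcc is a hypothetical helper, not from the original project):

import numpy as np

def normalize_mfcc(features, eps=1e-8):
    # Per-coefficient zero-mean, unit-variance normalization over frames.
    mean = features.mean(axis=0)
    std = features.std(axis=0)
    return (features - mean) / (std + eps)
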
Example #7
    def preprocess_data(self, config):
        """Processes the training data and returns MFCC vectors for all of them.
        Args:
            config: the Config object with various parameters specified
        Returns:
            inputs, input_masks: padded MFCC frames and frame masks for accent 1, one per training example
            labels, label_masks: padded MFCC frames and frame masks for accent 2, one per training example
        """
        inputs = []
        labels = []
        input_masks = []
        label_masks = []

        SOURCE_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
        TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
        #TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
        index = 0
        for source_fname, target_fname in zip(os.listdir(SOURCE_DIR),
                                              os.listdir(TARGET_DIR)):
            if index >= 20:
                break
            index += 1

            if source_fname == '.DS_Store' or target_fname == '.DS_Store':
                continue

            (source_sample_rate,
             source_wav_data) = wav.read(SOURCE_DIR + source_fname)
            (target_sample_rate,
             target_wav_data) = wav.read(TARGET_DIR + target_fname)

            source_mfcc_features = np.array(
                mfcc(source_wav_data,
                     samplerate=source_sample_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step))
            target_mfcc_features = np.array(
                mfcc(target_wav_data,
                     samplerate=target_sample_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step))

            # Aligns the MFCC feature matrices using FastDTW.
            source_mfcc_features, target_mfcc_features = get_dtw_series(
                source_mfcc_features, target_mfcc_features)

            # Pads the MFCC feature matrices (rows) to length config.max_num_frames
            source_padded_frames, source_mask = pad_sequence(
                source_mfcc_features, config.max_num_frames)
            target_padded_frames, target_mask = pad_sequence(
                target_mfcc_features, config.max_num_frames)


            inputs.append(source_padded_frames)
            input_masks.append(source_mask)
            labels.append(target_padded_frames)
            label_masks.append(target_mask)

        return inputs, input_masks, labels, label_masks
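
The masks returned here exist so that padded frames can be excluded from the training loss. A self-contained illustration of that idea (masked_mse is a hypothetical helper, not from the original project):

import numpy as np

def masked_mse(predicted, target, mask):
    """Mean squared error over real (unpadded) frames only.

    predicted, target: (max_num_frames, num_mfcc_coeffs) arrays
    mask: boolean (max_num_frames,) array, True for real frames
    """
    diff = (predicted - target)[mask]
    return np.mean(diff ** 2)

# Toy example with 3 real frames out of 5.
pred = np.random.randn(5, 13)
tgt = np.random.randn(5, 13)
m = np.array([True, True, True, False, False])
print(masked_mse(pred, tgt, m))
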