Example #1
0
def preprocess_data(num_mfcc_coeffs, num_filters, window_len, window_step,
                    max_num_frames):
    """Processes paired training wav files into aligned, padded MFCC matrices.

    Args:
        num_mfcc_coeffs: number of MFCC coefficients to extract per frame.
        num_filters: number of mel filterbank channels for the MFCC computation.
        window_len: analysis window length in seconds.
        window_step: step between successive analysis windows in seconds.
        max_num_frames: number of frames each MFCC matrix is padded to.
    Returns:
        inputs: list of padded source-accent MFCC matrices, one per example.
        labels: list of padded target-accent MFCC matrices, one per example.
    """
    inputs = []
    labels = []

    SOURCE_DIR = '../data/cmu_arctic/us-english-female-slt/wav/'
    TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    index = 0
    # Sort both listings so the source/target utterances pair up
    # deterministically; os.listdir order is arbitrary and may differ
    # between the two directories, which would misalign the pairs.
    for source_fname, target_fname in zip(sorted(os.listdir(SOURCE_DIR)),
                                          sorted(os.listdir(TARGET_DIR))):
        if index >= 20:  # small cap on the number of examples processed
            break

        # Skip macOS metadata entries that may appear in either listing.
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue
        # Count only files that are actually processed, so skipped entries
        # no longer consume the 20-example quota.
        index += 1

        (source_sample_rate,
         source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate,
         target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(
            mfcc(source_wav_data,
                 samplerate=source_sample_rate,
                 numcep=num_mfcc_coeffs,
                 nfilt=num_filters,
                 winlen=window_len,
                 winstep=window_step))
        target_mfcc_features = np.array(
            mfcc(target_wav_data,
                 samplerate=target_sample_rate,
                 numcep=num_mfcc_coeffs,
                 nfilt=num_filters,
                 winlen=window_len,
                 winstep=window_step))

        # Align the two MFCC sequences frame-by-frame with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pad the MFCC feature matrices (rows) to max_num_frames.
        # NOTE(review): pad_sequence is used here as if it returns a single
        # value; other variants in this file unpack (frames, mask) — confirm.
        source_padded_frames = pad_sequence(source_mfcc_features,
                                            max_num_frames)
        target_padded_frames = pad_sequence(target_mfcc_features,
                                            max_num_frames)

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
def preprocess_data(num_mfcc_coeffs, max_num_frames):
    """Processes paired training wav files into aligned, padded MFCC matrices.

    Args:
        num_mfcc_coeffs: number of MFCC coefficients to extract per frame.
        max_num_frames: number of frames each MFCC matrix is padded to.
    Returns:
        inputs: list of padded source-accent MFCC matrices (shuffled).
        labels: list of padded target-accent MFCC matrices (shuffled with
            the same permutation, so source/target pairs stay aligned).
    """
    inputs = []
    labels = []
    input_masks = []
    label_masks = []

    #SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    #TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
    SOURCE_DIR = '../data/cmu_arctic/mini_a/'
    TARGET_DIR = '../data/cmu_arctic/mini_b/'
    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR),
                                          os.listdir(TARGET_DIR)):
        (source_sample_rate,
         source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate,
         target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(
            mfcc(source_wav_data,
                 samplerate=source_sample_rate,
                 numcep=num_mfcc_coeffs))
        target_mfcc_features = np.array(
            mfcc(target_wav_data,
                 samplerate=target_sample_rate,
                 numcep=num_mfcc_coeffs))

        # Align the two MFCC sequences frame-by-frame with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pad the MFCC feature matrices (rows) to max_num_frames; each call
        # also yields a mask marking which frames are real data.
        source_padded_frames, source_mask = pad_sequence(
            source_mfcc_features, max_num_frames)
        target_padded_frames, target_mask = pad_sequence(
            target_mfcc_features, max_num_frames)

        inputs.append(source_padded_frames)
        input_masks.append(source_mask)
        labels.append(target_padded_frames)
        label_masks.append(target_mask)

    # Shuffle all four parallel lists with a single permutation so each
    # example stays aligned with its label and masks.
    # BUG FIX: random.shuffle requires a mutable sequence; range() is
    # immutable in Python 3, so the original call raised TypeError.
    randomized_indices = list(range(len(inputs)))
    random.shuffle(randomized_indices)
    inputs = [inputs[i] for i in randomized_indices]
    input_masks = [input_masks[i] for i in randomized_indices]
    labels = [labels[i] for i in randomized_indices]
    label_masks = [label_masks[i] for i in randomized_indices]

    # NOTE(review): the masks are built and shuffled but never returned; the
    # return signature is kept unchanged to avoid breaking callers — confirm.
    return inputs, labels
    def process_data(self, source_input_dir, target_input_dir):
        """Reads paired wav files from the two directories, extracts MFCCs,
        DTW-aligns each source/target pair, and returns the padded matrices.

        Args:
            source_input_dir: directory holding the source-accent wav files
            target_input_dir: directory holding the target-accent wav files
        Returns:
            inputs: list of padded source MFCC matrices
            labels: list of padded target MFCC matrices
        """
        inputs, labels = [], []

        index = 0  # kept for parity with sibling variants; unused here
        for src_name, tgt_name in zip(os.listdir(source_input_dir),
                                      os.listdir(target_input_dir)):
            # Ignore macOS metadata entries in either directory listing.
            if '.DS_Store' in (src_name, tgt_name):
                continue

            src_rate, src_wav = wav.read(source_input_dir + src_name)
            tgt_rate, tgt_wav = wav.read(target_input_dir + tgt_name)

            # appendEnergy is False because we want to keep the 0th coefficient
            src_feats = np.array(
                mfcc(src_wav,
                     samplerate=src_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step,
                     appendEnergy=False))
            tgt_feats = np.array(
                mfcc(tgt_wav,
                     samplerate=tgt_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step,
                     appendEnergy=False))

            # Align the two MFCC sequences with FastDTW.
            src_feats, tgt_feats = get_dtw_series(src_feats, tgt_feats)

            # Pad both matrices to config.max_num_frames; masks are discarded.
            src_padded, _ = pad_sequence(src_feats, self.config.max_num_frames)
            tgt_padded, _ = pad_sequence(tgt_feats, self.config.max_num_frames)

            inputs.append(src_padded)
            labels.append(tgt_padded)

        return inputs, labels
    def preprocess_data(self, config):
        """Loads paired source/target wav files, aligns them with FastDTW,
        pads each aligned series, and returns the (inputs, labels) lists.

        Args:
            config: the Config object with various parameters specified
        Returns:
            inputs: list of padded source-accent feature matrices
            labels: list of padded target-accent feature matrices
        """
        inputs, labels = [], []

        TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
        SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'

        processed = 0
        for src_name, tgt_name in zip(os.listdir(SOURCE_DIR),
                                      os.listdir(TARGET_DIR)):
            if processed >= 500:  # hard cap on the number of file pairs read
                break
            processed += 1

            # Ignore macOS metadata entries in either directory listing.
            if '.DS_Store' in (src_name, tgt_name):
                continue

            src_rate, src_wav = wav.read(SOURCE_DIR + src_name)
            tgt_rate, tgt_wav = wav.read(TARGET_DIR + tgt_name)

            # Align the two series with FastDTW. NOTE(review): this variant
            # feeds the raw waveform samples (not MFCC matrices) to
            # get_dtw_series — presumably intentional here, but worth
            # confirming against the other variants in this file.
            src_aligned, tgt_aligned = get_dtw_series(src_wav, tgt_wav)

            # Pad each aligned series to the configured maximum length.
            src_padded = self.pad_sequence(src_aligned)
            tgt_padded = self.pad_sequence(tgt_aligned)

            inputs.append(src_padded)
            labels.append(tgt_padded)

        return inputs, labels
Example #5
0
		def preprocess_data(self, config):
				"""Processes the training data and returns MFCC vectors for all of them.
				Args:
					config: the Config object with various parameters specified
				Returns:
					train_data:	A list of tuples, one for each training example: (accent 1 padded MFCC frames, accent 1 mask)
					train_labels: A list of tuples, one for each training example: (accent 2 padded MFCC frames, accent 2 mask)
				"""
				inputs = [] 
				inputs_corrupted = []
				labels = []	
				input_masks = []
				label_masks = []
				
				SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
				TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
				# Pair source/target utterances by zipping the two directory listings.
				for source_fname, target_fname in zip(os.listdir(SOURCE_DIR), os.listdir(TARGET_DIR)):
					(source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname) 
					(target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

					# Extract MFCC features from each raw waveform.
					source_mfcc_features = np.array(mfcc(source_wav_data, samplerate=source_sample_rate, numcep=self.config.num_mfcc_coeffs))
					target_mfcc_features = np.array(mfcc(target_wav_data, samplerate=target_sample_rate, numcep=self.config.num_mfcc_coeffs))

					# Aligns the MFCC features matrices using FastDTW.
					source_mfcc_features, target_mfcc_features = get_dtw_series(source_mfcc_features, target_mfcc_features)

					# Corrupt the input (source) MFCC features (denoising-autoencoder
					# style noise, controlled by config.corr_frac / config.corr_type).
					source_mfcc_features_corrupted = corrupt_input(source_mfcc_features, corr_frac=self.config.corr_frac,
																												 corr_type=self.config.corr_type) 

					# Pads the MFCC feature matrices (rows) to length config.max_num_frames
					source_padded_frames, source_mask = pad_sequence(source_mfcc_features, config.max_num_frames)
					source_padded_frames_corrupted, _ = pad_sequence(source_mfcc_features_corrupted, config.max_num_frames)
					target_padded_frames, target_mask = pad_sequence(target_mfcc_features, config.max_num_frames)

					inputs.append(source_padded_frames) 
					inputs_corrupted.append(source_padded_frames_corrupted)
					# NOTE(review): this block appears truncated in the source — the
					# remaining appends (labels, masks) and the return statement are
					# missing; recover them from the original file before relying on it.
Example #6
0
    def preprocess_data(self, config, SOURCE_DIR, TARGET_DIR=None):
        """Builds padded MFCC examples for accent classification.

        Args:
            config: the Config object with various parameters specified
            SOURCE_DIR: directory of wav files labeled 0 (or 1 when
                TARGET_DIR is None)
            TARGET_DIR: optional directory of wav files labeled 1; when
                given, each source/target pair is DTW-aligned before padding
        Returns:
            inputs: list of padded MFCC matrices
            labels: list of integer class labels (0 = source, 1 = target)
        """
        inputs, labels = [], []

        index = 0  # kept for parity with sibling variants; unused here
        if TARGET_DIR:
            for src_name, tgt_name in zip(os.listdir(SOURCE_DIR),
                                          os.listdir(TARGET_DIR)):
                # Ignore macOS metadata entries in either listing.
                if '.DS_Store' in (src_name, tgt_name):
                    continue

                src_rate, src_wav = wav.read(SOURCE_DIR + src_name)
                tgt_rate, tgt_wav = wav.read(TARGET_DIR + tgt_name)

                src_feats = np.array(
                    mfcc(src_wav,
                         samplerate=src_rate,
                         numcep=self.config.num_mfcc_coeffs,
                         nfilt=self.config.num_filters,
                         winlen=self.config.window_len,
                         winstep=self.config.window_step))
                tgt_feats = np.array(
                    mfcc(tgt_wav,
                         samplerate=tgt_rate,
                         numcep=self.config.num_mfcc_coeffs,
                         nfilt=self.config.num_filters,
                         winlen=self.config.window_len,
                         winstep=self.config.window_step))

                # Align the two MFCC sequences with FastDTW before padding.
                src_feats, tgt_feats = get_dtw_series(src_feats, tgt_feats)

                # Each aligned pair contributes one example per accent class.
                inputs.append(self.pad_sequence(src_feats))
                inputs.append(self.pad_sequence(tgt_feats))
                labels.append(0)
                labels.append(1)
        else:
            # Single-directory mode: every file is labeled as class 1 and no
            # DTW alignment is performed.
            for src_name in os.listdir(SOURCE_DIR):
                if src_name == '.DS_Store':
                    continue

                src_rate, src_wav = wav.read(SOURCE_DIR + src_name)

                src_feats = np.array(
                    mfcc(src_wav,
                         samplerate=src_rate,
                         numcep=self.config.num_mfcc_coeffs,
                         nfilt=self.config.num_filters,
                         winlen=self.config.window_len,
                         winstep=self.config.window_step))

                # Pad the MFCC matrix to config.max_num_frames.
                inputs.append(self.pad_sequence(src_feats))
                labels.append(1)

        return inputs, labels
    def preprocess_data(self, config):
        """Processes training wav pairs into aligned, padded MFCC matrices.

        Args:
            config: the Config object with various parameters specified
        Returns:
            inputs: padded source-accent MFCC matrices
            input_masks: masks marking the real (unpadded) source frames
            labels: padded target-accent MFCC matrices
            label_masks: masks marking the real (unpadded) target frames
        """
        inputs, labels = [], []
        input_masks, label_masks = [], []

        SOURCE_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
        TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
        #TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'

        pair_count = 0
        for src_name, tgt_name in zip(os.listdir(SOURCE_DIR),
                                      os.listdir(TARGET_DIR)):
            # Stop after 20 directory entries; note that skipped entries also
            # count toward the cap (matches the original behavior).
            if pair_count >= 20:
                break
            pair_count += 1

            # Ignore macOS metadata entries in either listing.
            if '.DS_Store' in (src_name, tgt_name):
                continue

            src_rate, src_wav = wav.read(SOURCE_DIR + src_name)
            tgt_rate, tgt_wav = wav.read(TARGET_DIR + tgt_name)

            src_feats = np.array(
                mfcc(src_wav,
                     samplerate=src_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step))
            tgt_feats = np.array(
                mfcc(tgt_wav,
                     samplerate=tgt_rate,
                     numcep=self.config.num_mfcc_coeffs,
                     nfilt=self.config.num_filters,
                     winlen=self.config.window_len,
                     winstep=self.config.window_step))

            # Align the two MFCC sequences with FastDTW.
            src_feats, tgt_feats = get_dtw_series(src_feats, tgt_feats)

            # Pad each matrix to config.max_num_frames; pad_sequence also
            # returns a mask marking which frames are real data.
            src_padded, src_mask = pad_sequence(src_feats,
                                                config.max_num_frames)
            tgt_padded, tgt_mask = pad_sequence(tgt_feats,
                                                config.max_num_frames)

            inputs.append(src_padded)
            input_masks.append(src_mask)
            labels.append(tgt_padded)
            label_masks.append(tgt_mask)

        return inputs, input_masks, labels, label_masks