Example #1
    def get_normalisation_params(self, dir_out, file_name=None):
        """
        Read the mean and std_dev values from a file.
        Save them in self.norm_params.

        :param dir_out:    Directory containing the normalisation file.
        :param file_name:  Prefix of the normalisation file. Expects the file to
                           be named <file_name>-<MeanStdDevExtractor.file_name_appendix>.npz
        :return:           Tuple of normalisation parameters (mean, std_dev).
        """

        if dir_out is None:
            dir_out = self.dir_labels

        if file_name is None:
            file_name = ""
        elif os.path.basename(file_name) != "":
            file_name += "-"
        full_file_name = file_name + MeanStdDevExtractor.file_name_appendix

        try:
            self.norm_params = MeanStdDevExtractor.load(
                os.path.join(dir_out, full_file_name + ".npz"))
        except FileNotFoundError:
            # LEGACY support
            self.norm_params = MeanStdDevExtractor.load(
                os.path.join(dir_out, full_file_name + ".bin"))

        return self.norm_params
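
A minimal sketch of how the returned (mean, std_dev) tuple is typically applied, assuming features is a num_frames x num_features numpy array; the helper names below are illustrative and not part of the example:

    import numpy as np

    def normalise(features: np.ndarray, norm_params) -> np.ndarray:
        # norm_params is the (mean, std_dev) tuple returned by
        # get_normalisation_params(); both broadcast over the frame axis.
        mean, std_dev = norm_params
        return (features - mean) / std_dev

    def denormalise(features: np.ndarray, norm_params) -> np.ndarray:
        # Inverse operation, e.g. applied to generated output.
        mean, std_dev = norm_params
        return features * std_dev + mean
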
Example #2
    def __init__(self, config: Config) -> None:
        super().__init__(config)

        self.directory = config.directory
        self.features = config.features
        self.indices = config.indices

        if config.norm_type == NpzDataReader.Config.NormType.NONE:
            self.normaliser = None
        elif config.norm_type == NpzDataReader.Config.NormType.MEAN_VARIANCE:
            self.normaliser = MeanCovarianceExtractor()
        elif config.norm_type == NpzDataReader.Config.NormType.MEAN_STDDEV:
            self.normaliser = MeanStdDevExtractor()
        elif config.norm_type == NpzDataReader.Config.NormType.MIN_MAX:
            self.normaliser = MinMaxExtractor()
        else:
            raise NotImplementedError("Unknown norm_type {}".format(
                config.norm_type))

        if config.norm_params is not None:
            self.norm_params = config.norm_params
        elif config.norm_params_path is not None:
            self.norm_params = self.normaliser.load(config.norm_params_path)
        else:
            self.norm_params = None

        self.preprocessing_fn = config.preprocessing_fn
        self.preprocess_before_norm = config.preprocess_before_norm
        self.postprocessing_fn = config.postprocessing_fn
        self.postprocess_before_norm = config.postprocess_before_norm
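
The if/elif chain above maps the configured norm_type to an extractor class. The same dispatch can be written as a lookup table; this is a sketch of an alternative idiom, not the reader's actual implementation, and it assumes the same imports as the example:

    # Sketch: dispatch norm_type via a mapping instead of if/elif.
    _NORMALISERS = {
        NpzDataReader.Config.NormType.NONE: None,
        NpzDataReader.Config.NormType.MEAN_VARIANCE: MeanCovarianceExtractor,
        NpzDataReader.Config.NormType.MEAN_STDDEV: MeanStdDevExtractor,
        NpzDataReader.Config.NormType.MIN_MAX: MinMaxExtractor,
    }

    def build_normaliser(norm_type):
        try:
            normaliser_cls = _NORMALISERS[norm_type]
        except KeyError:
            raise NotImplementedError("Unknown norm_type {}".format(norm_type))
        return normaliser_cls() if normaliser_cls is not None else None
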
Example #3
    def get_normalisation_params(self, dir_out, file_name=None):
        """
        Read the mean and std_dev values from a file.
        Save them in self.norm_params.

        :param dir_out:       Directory containing the normalisation file, usually the atom directory.
        :param file_name:     Prefix of the normalisation file.
                              Expects the file to be named <file_name>-<MeanStdDevExtractor.file_name_appendix>.bin
        :return:              Tuple of normalisation parameters (mean, std_dev).
        """

        full_file_name = (file_name + "-" if file_name is not None else
                          "") + MeanStdDevExtractor.file_name_appendix + ".bin"

        # Use the same normalisation parameters for the LF0 curve without phrase curve
        # as for atoms. The phrase directory is the same as the atom directory.
        mean, std_dev = MeanStdDevExtractor.load(
            os.path.join(self.dir_phrase, full_file_name))
        # Dimension of both is 1 x 2 (atom amplitude, theta).
        mean, std_dev = mean[:, 0:1], std_dev[:, 0:1]

        # Manually set V/UV normalisation parameters and save the concatenated normalisation parameters locally.
        self.norm_params = np.concatenate((mean, np.zeros((1, 1))), axis=1),\
                           np.concatenate((std_dev, np.ones((1, 1))), axis=1)

        return self.norm_params
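
A numpy-only sketch of the shape handling above: the loaded parameters have shape 1 x 2 (atom amplitude, theta), only the amplitude column is kept, and V/UV statistics (mean 0, std_dev 1) are appended. The numbers are dummy values for illustration:

    import numpy as np

    # Dummy 1 x 2 parameters standing in for MeanStdDevExtractor.load(...).
    mean = np.array([[0.3, 40.0]])      # [atom amplitude, theta]
    std_dev = np.array([[0.1, 12.0]])

    mean, std_dev = mean[:, 0:1], std_dev[:, 0:1]  # Keep the amplitude column, shape 1 x 1.
    norm_params = (np.concatenate((mean, np.zeros((1, 1))), axis=1),     # V/UV mean = 0.
                   np.concatenate((std_dev, np.ones((1, 1))), axis=1))   # V/UV std_dev = 1.
    # Each element of norm_params now has shape 1 x 2: [amplitude, vuv].
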
Example #4
    def get_normalisation_params(self, dir_out, file_name=None):
        """
        Read the mean and std_dev values from a file.
        Save them in self.norm_params.

        :param dir_out:       Directory containing the normalisation file.
        :param file_name:     Prefix of the normalisation file.
                              Expects the file to be named <file_name>-<MeanStdDevExtractor.file_name_appendix>.bin
        :return:              Tuple of normalisation parameters (mean, std_dev).
        """

        full_file_name = (file_name + "-" if file_name is not None else
                          "") + MeanStdDevExtractor.file_name_appendix + ".bin"

        if not self.add_deltas:
            # Collect all means and std_devs in a list.
            all_mean = list()
            all_std_dev = list()
            # Load normalisation parameters for all features.
            mean, std_dev = MeanStdDevExtractor.load(
                os.path.join(dir_out, self.dir_lf0, full_file_name))
            all_mean.append(np.atleast_2d(mean))
            all_std_dev.append(np.atleast_2d(std_dev))
            # Manually set vuv normalisation parameters.
            # Note that vuv normalisation parameters are not saved in gen_data method (except for add_deltas=True).
            all_mean.append(np.atleast_2d(0.0))
            all_std_dev.append(np.atleast_2d(1.0))

            # for dir_feature in [self.dir_lf0, self.dir_vuv]:
            #     mean, std_dev = MeanStdDevExtractor.load(os.path.join(dir_out, dir_feature, full_file_name))
            #     all_mean.append(np.atleast_2d(mean))
            #     all_std_dev.append(np.atleast_2d(std_dev))

            # Save the concatenated normalisation parameters locally.
            self.norm_params = np.concatenate(
                all_mean, axis=1), np.concatenate(all_std_dev, axis=1)
        else:
            # Save the normalisation parameters locally.
            # VUV normalisation parameters are manually set to mean=0 and std_dev=1 in gen_data method.
            self.norm_params = MeanStdDevExtractor.load(
                os.path.join(dir_out, self.dir_deltas, full_file_name))

        return self.norm_params
Example #5
    def gen_data(dir_in: os.PathLike,
                 opensmile_config_file: os.PathLike,
                 feature_name: str,
                 num_frames: int,
                 dir_out: os.PathLike = None,
                 file_id_list: os.PathLike = None,
                 id_list: List[str] = None,
                 file_ext: str = "wav",
                 return_dict: bool = False) -> Tuple:

        if file_id_list is None:
            file_id_list_name = ""
        else:
            id_list, file_id_list_name = OpenSMILELabelGen._get_id_list(
                dir_in, file_id_list, id_list, file_ext)
            if file_id_list_name is not None and file_id_list_name != "":
                file_id_list_name += "-"

        if return_dict:
            label_dict = {}

        normaliser = MeanStdDevExtractor()

        for file_name in id_list:
            features = OpenSMILELabelGen.extract_features(
                config_file=opensmile_config_file,
                file_path=os.path.join(dir_in, file_name + "." + file_ext),
                num_frames=num_frames)

            if return_dict:
                label_dict[file_name] = features

            normaliser.add_sample(features)

            if dir_out is not None:
                out_file_path = os.path.join(dir_out, file_name)
                OpenSMILELabelGen._save_to_npz(file_path=out_file_path,
                                               features=features.astype(
                                                   np.float32),
                                               feature_name=feature_name)

        if dir_out is not None:
            norm_file_path = os.path.join(dir_out,
                                          file_id_list_name + feature_name)
            logging.info("Write norm_prams to {}".format(norm_file_path))
            normaliser.save(norm_file_path)

        mean, std_dev = normaliser.get_params()
        if return_dict:
            return label_dict, mean, std_dev
        else:
            return mean, std_dev
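
The normaliser in this example is filled incrementally: add_sample() once per utterance, then get_params() at the end. A self-contained sketch of that running mean/std_dev computation, assuming the extractor accumulates per-dimension sums and squared sums (the attribute names mirror those used in later examples, but this toy class is not the library implementation):

    import numpy as np

    class RunningMeanStdDev:
        """Toy stand-in for MeanStdDevExtractor's accumulation pattern."""

        def __init__(self):
            self.sum_frames = 0.0
            self.sum_squared_frames = 0.0
            self.sum_length = 0

        def add_sample(self, features: np.ndarray) -> None:
            # features: num_frames x num_features.
            self.sum_frames = self.sum_frames + features.sum(axis=0)
            self.sum_squared_frames = self.sum_squared_frames + np.square(features).sum(axis=0)
            self.sum_length += len(features)

        def get_params(self):
            mean = self.sum_frames / self.sum_length
            variance = self.sum_squared_frames / self.sum_length - np.square(mean)
            return mean, np.sqrt(variance)
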
Example #6
    def get_normalisation_params(self, dir_out, file_name=None):
        """
        Read the mean and std_dev values from a file.
        Save them in self.norm_params.

        :param dir_out:       Directory containing the normalisation file.
        :param file_name:     Prefix of the normalisation file.
                              Expects the file to be named <file_name>-<MeanStdDevExtractor.file_name_appendix>.bin
        :return:              Tuple of normalisation parameters (mean, std_dev).
        """

        full_file_name = (file_name + "-" if file_name is not None else "") + MeanStdDevExtractor.file_name_appendix + ".bin"
        self.norm_params = MeanStdDevExtractor.load(os.path.join(self.dir_labels, full_file_name))

        return self.norm_params
Example #7
    def gen_data(self, dir_in, dir_out=None, file_id_list="", id_list=None, add_deltas=False, return_dict=False):
        """
        Prepare LF0 and V/UV features from audio files. If add_deltas is False, each numpy array has the
        dimension num_frames x 2 [lf0, vuv]; otherwise deltas and double deltas are added to the lf0 feature,
        resulting in num_frames x 4 [lf0(3*1), vuv].

        :param dir_in:         Directory where the .wav files are stored for each utterance to process.
        :param dir_out:        Main directory where the labels and normalisation parameters are saved to subdirectories.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all .wav files in dir_in are used.
        :param add_deltas:     Add deltas and double deltas to all features except vuv.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill id_list with the .wav files in dir_in if it is not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            if add_deltas:
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_deltas))
            else:
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_lf0))
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_vuv))

        # Create the return dictionary if required.
        if return_dict:
            label_dict = OrderedDict()

        # Create normalisation computation units.
        norm_params_ext_lf0 = MeanStdDevExtractor()
        # norm_params_ext_vuv = MeanStdDevExtractor()
        norm_params_ext_deltas = MeanStdDevExtractor()

        logging.info("Extract WORLD LF0 features for " + "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:
            logging.debug("Extract WORLD LF0 features from " + file_name)

            # Load audio file and extract features.
            audio_name = os.path.join(dir_in, file_name + ".wav")
            raw, fs = soundfile.read(audio_name)
            _f0, t = pyworld.dio(raw, fs)  # Raw pitch extraction. TODO: Use magphase here?
            f0 = pyworld.stonemask(raw, _f0, t, fs)  # Pitch refinement.

            # Compute lf0 and vuv information.
            lf0 = np.log(f0, dtype=np.float32)
            lf0[lf0 <= math.log(LF0LabelGen.f0_silence_threshold)] = LF0LabelGen.lf0_zero
            lf0, vuv = interpolate_lin(lf0)

            if add_deltas:
                # Compute the deltas and double deltas for all features.
                lf0_deltas, lf0_double_deltas = compute_deltas(lf0)

                # Combine them to a single feature sample.
                labels = np.concatenate((lf0, lf0_deltas, lf0_double_deltas, vuv), axis=1)

                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = labels
                if dir_out is not None:
                    labels.tofile(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_name + LF0LabelGen.ext_deltas))

                # Add sample to normalisation computation unit.
                norm_params_ext_deltas.add_sample(labels)
            else:
                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = np.concatenate((lf0, vuv), axis=1)
                if dir_out is not None:
                    lf0.tofile(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_name + LF0LabelGen.ext_lf0))
                    vuv.astype(np.float32).tofile(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_name + LF0LabelGen.ext_vuv))

                # Add sample to normalisation computation unit.
                norm_params_ext_lf0.add_sample(lf0)
                # norm_params_ext_vuv.add_sample(vuv)

        # Save mean and std dev of all features.
        if not add_deltas:
            norm_params_ext_lf0.save(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_id_list_name))
            # norm_params_ext_vuv.save(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_id_list_name))
        else:
            # Manually set vuv normalisation parameters before saving.
            norm_params_ext_deltas.sum_frames[-1] = 0.0  # Mean = 0.0
            norm_params_ext_deltas.sum_squared_frames[-1] = norm_params_ext_deltas.sum_length  # Variance = 1.0
            norm_params_ext_deltas.save(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_id_list_name))

        # Get normalisation parameters.
        if not add_deltas:
            norm_lf0 = norm_params_ext_lf0.get_params()
            # norm_vuv = norm_params_ext_vuv.get_params()

            norm_first = np.concatenate((norm_lf0[0], (0.0,)), axis=0)
            norm_second = np.concatenate((norm_lf0[1], (1.0,)), axis=0)
        else:
            norm_first, norm_second = norm_params_ext_deltas.get_params()

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
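
The LF0/V/UV core of this example can be exercised on its own. A hedged sketch that extracts F0 with pyworld, converts it to log-F0 and builds a voiced/unvoiced mask; the simple linear interpolation stands in for the repo's interpolate_lin helper and the threshold values are illustrative:

    import numpy as np
    import pyworld
    import soundfile

    def extract_lf0_vuv(wav_path, f0_silence_threshold=30.0, lf0_zero=0.0):
        raw, fs = soundfile.read(wav_path)       # soundfile returns float64, as pyworld expects.
        _f0, t = pyworld.dio(raw, fs)            # Raw pitch extraction.
        f0 = pyworld.stonemask(raw, _f0, t, fs)  # Pitch refinement.

        with np.errstate(divide="ignore"):
            lf0 = np.log(f0, dtype=np.float32)
        lf0[f0 <= f0_silence_threshold] = lf0_zero  # Mark silent/unvoiced frames.

        vuv = (lf0 != lf0_zero).astype(np.float32)  # 1 = voiced, 0 = unvoiced.
        voiced_idx = np.where(vuv > 0)[0]
        if len(voiced_idx) > 1:
            # Stand-in for interpolate_lin: fill unvoiced frames by linear interpolation.
            lf0 = np.interp(np.arange(len(lf0)), voiced_idx, lf0[voiced_idx]).astype(np.float32)

        return lf0[:, None], vuv[:, None]           # num_frames x 1 each.
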
Example #8
    def gen_data(dir_in,
                 dir_out=None,
                 file_id_list="",
                 id_list=None,
                 return_dict=False):
        """
        Prepare durations from HTK labels (forced-aligned).
        Each numpy array has the dimension num_phonemes x PhonemeDurationLabelGen.num_states (default num_state=5).

        :param dir_in:         Directory where the HTK label files are stored (usually named label_state_align).
        :param dir_out:        Main directory where the labels and normalisation parameters are saved to.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all files in dir_in are used.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill id_list with the .wav files in dir_in if it is not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]
            # Ignore full path.
            id_list = [os.path.basename(element) for element in id_list]

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            makedirs_safe(dir_out)

        # Create the return dictionary if required.
        if return_dict:
            label_dict = OrderedDict()

        # Create normalisation computation units.
        norm_params_ext_dur = MeanStdDevExtractor()

        logging.info("Extract phoneme durations for " +
                     "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:
            logging.debug("Extract phoneme durations from " + file_name)

            with open(
                    os.path.join(
                        dir_in,
                        file_name + PhonemeDurationLabelGen.ext_phonemes),
                    'r') as f:
                htk_labels = [line.rstrip('\n').split()[:2] for line in f]
                timings = np.array(
                    htk_labels, dtype=np.float32
                ) / PhonemeDurationLabelGen.min_phoneme_length
                dur = timings[:, 1] - timings[:, 0]
                dur = dur.reshape(
                    -1, PhonemeDurationLabelGen.num_states).astype(np.float32)

            if return_dict:
                label_dict[file_name] = dur
            if dir_out is not None:
                dur.tofile(
                    os.path.join(
                        dir_out,
                        file_name + PhonemeDurationLabelGen.ext_durations))

            # Add sample to normalisation computation unit.
            norm_params_ext_dur.add_sample(dur)

        # Save mean and std dev of all features.
        norm_params_ext_dur.save(os.path.join(dir_out, file_id_list_name))

        # Get normalisation parameters.
        norm_first, norm_second = norm_params_ext_dur.get_params()

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
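
The timing-to-duration step above relies on two class constants: min_phoneme_length (the time unit the HTK timings are divided by) and num_states (states per phoneme). A standalone sketch of the same parsing on a few HTK-style lines, with illustrative constant values:

    import numpy as np

    MIN_PHONEME_LENGTH = 50000  # Assumed value: 5 ms expressed in HTK 100 ns units.
    NUM_STATES = 5              # States per phoneme in a state-aligned label file.

    def htk_lines_to_durations(lines):
        # Each line: "<start> <end> <label>"; only the timings are used.
        htk_labels = [line.rstrip("\n").split()[:2] for line in lines]
        timings = np.array(htk_labels, dtype=np.float32) / MIN_PHONEME_LENGTH
        dur = timings[:, 1] - timings[:, 0]                     # Duration per state.
        return dur.reshape(-1, NUM_STATES).astype(np.float32)  # num_phonemes x NUM_STATES.

    # One phoneme with five states of 100000 HTK units (10 ms) each.
    lines = ["{} {} s{}".format(i * 100000, (i + 1) * 100000, i + 2) for i in range(5)]
    print(htk_lines_to_durations(lines))  # -> [[2. 2. 2. 2. 2.]]
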
Example #9
    def gen_data(self,
                 dir_in,
                 dir_out=None,
                 file_id_list=None,
                 id_list=None,
                 return_dict=False):
        """
        Prepare atom labels from wav files.
        If id_list is not None, only the listed ids are processed, otherwise every .wav file in dir_in is used.
        Atoms are computed by the wcad algorithm. Utterances whose extraction looks unreliable (too many atoms or
        implausibly large amplitudes) are flagged and excluded from the normalisation statistics; the ids of the
        successful extractions are written to a new file_id_list next to dir_in, but the current file_id_list is
        not replaced by it. The algorithm also saves the extracted phrase component in dir_out/id_name.phrase,
        if dir_out is not None.

        :param dir_in:           Directory containing the original wav files.
        :param dir_out:          Directory where the labels are stored. If None, no labels are stored.
        :param file_id_list:     Name of the file containing the ids. Normalisation parameters are saved using
                                 this name to differentiate parameters between subsets.
        :param id_list:          The list of utterances to process.
                                 Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                                 If None, all wav files in audio_dir are used.
        :param return_dict:      If True, returns an OrderedDict of all samples as first output.
        :return:                 Returns mean=0.0, std_dev, min, max of atoms.
        """

        # Fill id_list with the .wav files in dir_in if it is not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]

        if dir_out is not None:
            makedirs_safe(dir_out)

        if return_dict:
            label_dict = OrderedDict()

        mean_std_ext_atom = MeanStdDevExtractor()
        min_max_ext_atom = MinMaxExtractor()
        mean_std_ext_phrase = MeanStdDevExtractor()
        min_max_ext_phrase = MinMaxExtractor()

        # Compute Atoms.
        from wcad import (WaveInput, PitchExtractor, MultiphraseExtractor,
                          DictionaryGenerator, AtomExtrator, ModelCreator,
                          ModelSaver, Params, Paths)
        correct_utts = list()
        self.logger.info("Create atom labels for " +
                         "[{0}]".format(", ".join(str(i) for i in id_list)))
        for id_name in id_list:
            self.logger.debug("Create atom labels for " + id_name)

            # Wcad has to be called in its root directory, therefore a change dir operation is necessary.
            cwd = os.getcwd()
            os.chdir(self.wcad_root)
            args = [dir_in + "/" + id_name + ".wav", dir_out]
            print(args)
            params = Params()
            # Overwrite the possible theta values by selected values.
            params.local_atoms_thetas = self.theta_interval
            params.k = [self.k]
            # params.min_atom_amp = 0.1
            paths = Paths(args, params)
            # Start the extraction process.
            start_t = time.time()
            waveform = WaveInput(paths.wav, params).read()
            pitch = PitchExtractor(waveform, params, paths).compute()
            # Compute the phrase component.
            phrase = MultiphraseExtractor(pitch, waveform, params,
                                          paths).compute()
            phrase_curve = phrase.curve
            # Extract atoms.
            dictionary = DictionaryGenerator(params, paths).compute()
            atoms = AtomExtrator(waveform, pitch, phrase, dictionary, params,
                                 paths).compute()
            # Create a model.
            model = ModelCreator(phrase, atoms, pitch).compute()
            print(('Model created in %s seconds' % (time.time() - start_t)))
            # Save the atoms.
            ModelSaver(model, params, paths).save()
            os.chdir(cwd)

            # Check if output can be correct.
            possible_extraction_failure = False
            if len(atoms) < 50 and not any(a.amp > 10 for a in atoms):
                correct_utts.append(id_name)
            else:
                self.logger.warning("Possible fail of atom extractor for " +
                                    id_name + " (atoms: " + str(len(atoms)) +
                                    ", frames: " + str(len(phrase_curve)) +
                                    ", max: " +
                                    str(max(a.amp for a in atoms)) + ").")
                possible_extraction_failure = True

            atoms.sort(key=lambda x: x.position)
            # print_atoms(atoms)

            # Get audio length needed to trim the atoms.
            duration = self.get_audio_length(id_name, dir_in,
                                             self.frame_size_ms)

            # The algorithm generates a few atoms at negative positions,
            # pad them into the first atom at positive position.
            padded_amp = 0
            padded_theta = 0
            for idx, atom in enumerate(atoms):
                if atom.position < 0:
                    padded_amp += atom.amp
                    padded_theta += atom.theta
                else:
                    atoms[idx].amp += padded_amp  # Pad the amplitude.
                    atoms[idx].theta = (atoms[idx].theta +
                                        padded_theta) / (idx + 1)
                    del atoms[:idx]  # Remove the negative atoms from the list.
                    break
            # print_atoms(atoms)

            # The algorithm might also generate a few atoms beyond the last label,
            # pad them into the last label.
            padded_amp = 0
            padded_theta = 0
            for idx, atom in reversed(list(enumerate(atoms))):
                if atom.position * self.frame_size_ms > duration:
                    padded_amp += atom.amp
                    padded_theta += atom.theta
                else:
                    atoms[idx].amp += padded_amp
                    atoms[idx].theta = (atoms[idx].theta +
                                        padded_theta) / (len(atoms) - idx)
                    # Remove atoms beyond the last label.
                    atoms = atoms[:-(len(atoms) - idx - 1) or None]
                    break
            # print_atoms(atoms)

            # Create a label for each frame (size of frame_size_ms) with amplitude and theta of contained atoms.
            np_atom_labels = AtomLabelGen.atoms_to_labels(
                atoms, self.theta_interval, int(duration / self.frame_size_ms))

            np_atom_amps = np.sum(np_atom_labels, axis=1)

            # Only add successful extractions to the mean and std_dev computation.
            if not possible_extraction_failure:
                # Only compute std_dev from frames that contain an atom.
                mean_std_ext_atom.add_sample(np_atom_amps[np_atom_amps[:, 0] != 0.0])
                min_max_ext_atom.add_sample(np_atom_amps)
                # mean_std_ext_phrase.add_sample(phrase_curve)
                # min_max_ext_phrase.add_sample(phrase_curve)

            if return_dict:
                label_dict[id_name] = np_atom_labels
            if dir_out is not None:
                # Save phrase, because it might be used in synthesis.
                phrase_curve.astype('float32').tofile(
                    os.path.join(dir_out, id_name + self.ext_phrase))

                # Save atoms binary (float32).
                np_atom_labels.astype('float32').tofile(
                    os.path.join(dir_out, id_name + self.ext_atoms))

                # Create a readable version of the atom data.
                # np.savetxt(os.path.join(dir_out, id_name + self.ext_atoms + ".txt"), np_atom_labels)

        # Manually set mean of atoms to 0, otherwise frames without atom will have an amplitude.
        if mean_std_ext_atom.sum_length > 0:  # Make sure at least one atom was added.
            mean_std_ext_atom.sum_frames[:] = 0.0
        else:
            mean_std_ext_atom.sum_frames = np.zeros(np_atom_amps.shape[1:])
            mean_std_ext_atom.sum_squared_frames = np.zeros(
                np_atom_amps.shape[1:])
        mean_std_ext_atom.sum_squared_frames[1] = (
            mean_std_ext_atom.sum_length * self.theta_interval[-1])

        mean_std_ext_atom.save(os.path.join(dir_out, file_id_list_name))
        min_max_ext_atom.save(os.path.join(dir_out, file_id_list_name))
        # mean_std_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))
        # min_max_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))

        mean_atoms, std_atoms = mean_std_ext_atom.get_params()
        min_atoms, max_atoms = min_max_ext_atom.get_params()
        # mean_phrase, std_phrase = mean_std_ext_phrase.get_params()
        # min_phrase, max_phrase = min_max_ext_atom.get_params()

        # Use this block to save the part of the file_id_list for which atom extraction was successful into a new file.
        if correct_utts:
            with open(
                    os.path.join(
                        os.path.dirname(dir_in), "wcad_" +
                        os.path.basename(file_id_list_name) + ".txt"),
                    'w') as f:
                f.write('\n'.join(correct_utts) + '\n')

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, \
                   mean_atoms, std_atoms, \
                   min_atoms, max_atoms
            # mean_phrase, std_phrase, \
            # min_phrase, max_phrase
        else:
            return mean_atoms, std_atoms, \
                   min_atoms, max_atoms
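
The two padding loops in this example fold atoms that fall before frame 0 (or beyond the last label) into the nearest in-range atom. A stripped-down sketch of the first loop on plain objects; the Atom class below is a hypothetical stand-in for the wcad atom objects, carrying only the fields the loop touches:

    class Atom:
        def __init__(self, position, amp, theta):
            self.position, self.amp, self.theta = position, amp, theta

    def pad_negative_atoms(atoms):
        """Fold atoms at negative positions into the first atom at a positive position."""
        padded_amp = 0.0
        padded_theta = 0.0
        for idx, atom in enumerate(atoms):
            if atom.position < 0:
                padded_amp += atom.amp
                padded_theta += atom.theta
            else:
                atoms[idx].amp += padded_amp  # Pad the amplitude.
                # Average theta over the folded atoms and the receiving one.
                atoms[idx].theta = (atoms[idx].theta + padded_theta) / (idx + 1)
                del atoms[:idx]  # Remove the negative atoms from the list.
                break
        return atoms

    atoms = [Atom(-3, 0.25, 30.0), Atom(-1, 0.25, 50.0), Atom(5, 0.5, 40.0)]
    padded = pad_negative_atoms(atoms)
    print(padded[0].amp, padded[0].theta)  # -> 1.0 40.0
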
Example #10
    def gen_data(dir_in,
                 dir_out=None,
                 file_id_list="",
                 id_list=None,
                 label_type="full_state_align",
                 return_dict=False):
        """
        Prepare durations from HTK labels (forced-aligned).
        Each numpy array has the dimension
        num_phonemes x PhonemeDurationLabelGen.num_states (default
        num_state=5).

        :param dir_in:         Directory where the HTK label files are
                               stored (usually named label_state_align).
        :param dir_out:        Main directory where the labels and
                               normalisation parameters are saved to.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids.
                               Normalisation parameters are saved using
                               this name to differentiate parameters
                               between subsets.
        :param id_list:        The list of utterances to process. Should
                               have the form uttId1 \\n uttId2 \\n ...
                               \\n uttIdN. If None, all files in dir_in
                               are used.
        :param label_type:     Type of the label files, either
                               "full_state_align" (HTK .lab files) or
                               "mfa" (Montreal Forced Aligner .TextGrid
                               files).
        :param return_dict:    If true, returns an OrderedDict of all
                               samples as first output.
        :return:               Returns two normalisation parameters as
                               tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict
                               followed by the two normalisation
                               parameters.
        """

        # Fill id_list with the .wav files in dir_in if it is not given
        # and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]
            # Ignore full path.
            id_list = [os.path.basename(element) for element in id_list]

        if dir_out is not None:
            makedirs_safe(dir_out)

        if return_dict:
            label_dict = OrderedDict()

        norm_params_ext_dur = MeanStdDevExtractor()

        logging.info("Extract phoneme durations for " +
                     "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:
            logging.debug("Extract phoneme durations from " + file_name)

            if label_type == "full_state_align":
                file_path = os.path.join(dir_in, file_name + ".lab")
                dur = PhonemeDurationLabelGen._get_full_state_align_dur(
                    file_path, PhonemeDurationLabelGen.min_phoneme_length,
                    PhonemeDurationLabelGen.num_states)
            elif label_type == "mfa":
                file_path = os.path.join(dir_in, file_name + ".TextGrid")
                dur = PhonemeDurationLabelGen._get_mfa_dur(
                    file_path, PhonemeDurationLabelGen.frame_length_sec)
            else:
                raise NotImplementedError(
                    "Unknown label type {}.".format(label_type))

            if return_dict:
                label_dict[file_name] = dur
            if dir_out is not None:
                file_path = os.path.join(dir_out, file_name)
                LabelGen._save_to_npz(file_path, dur, "dur")

            norm_params_ext_dur.add_sample(dur)

        norm_params_ext_dur.save(os.path.join(dir_out, file_id_list_name))

        norm_first, norm_second = norm_params_ext_dur.get_params()

        if return_dict:
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
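
A hedged usage sketch of how this gen_data variant might be called for both label types; the paths, file id list name, and import location are illustrative assumptions, not taken from the example:

    # Hypothetical call sites; adjust paths and imports to the actual project layout.
    mean, std_dev = PhonemeDurationLabelGen.gen_data(
        dir_in="database/labels/label_state_align",   # HTK state-aligned .lab files.
        dir_out="experiments/dur",
        file_id_list="database/file_id_list_train.txt",
        id_list=["speaker1/utt0001", "speaker1/utt0002"],
        label_type="full_state_align")

    label_dict, mean, std_dev = PhonemeDurationLabelGen.gen_data(
        dir_in="database/mfa",                        # Montreal Forced Aligner .TextGrid files.
        dir_out="experiments/dur_mfa",
        file_id_list="database/file_id_list_train.txt",
        id_list=["speaker1/utt0001"],
        label_type="mfa",
        return_dict=True)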