Exemplo n.º 1
0
def create_utterances(festival_dir,
                      file_ids,
                      sentences,
                      out_dir,
                      custom_voice=None):
    """Write a Festival script that synthesises and saves every sentence, then run it.

    Args:
        festival_dir (str): Festival installation directory; the executable used is `bin/festival` inside it.
        file_ids: Identifiers paired one-to-one with `sentences`; each names the `.utt` file that is saved.
        sentences: Text of each utterance. It is inserted verbatim inside a double-quoted Scheme string,
            so any quoting must already have been handled by the caller.
        out_dir (str): Directory that receives `gen_utts.scm` and the `utts` sub-directory.
        custom_voice (str): Optional voice name; when given, `(voice_<name>)` is issued before any synthesis.

    Raises:
        subprocess.CalledProcessError: If the Festival process exits with a non-zero return code.
    """
    festival_exe = os.path.join(festival_dir, 'bin', 'festival')
    utt_dir = os.path.join(out_dir, 'utts')
    script_path = os.path.join(out_dir, 'gen_utts.scm')

    # The script starts with a shebang line, optionally followed by the voice selection.
    script_lines = [f'#!{festival_exe}']
    if custom_voice is not None:
        script_lines.append(f'(voice_{custom_voice})')

    # One synthesise-and-save command per (file_id, sentence) pair.
    save_template = '(utt.save (utt.synth (Utterance Text "{sentence}" )) "{utt_file}")'
    script_lines.extend(
        save_template.format(sentence=text,
                             utt_file=os.path.join(utt_dir, f'{name}.utt'))
        for name, text in zip(file_ids, sentences))

    file_io.save_lines(script_lines, script_path)

    # file_ids may be paths (e.g. for multi-speaker data), so the directory structure must exist before Festival runs.
    utils.make_dirs(utt_dir, file_ids)

    # `check=True` raises an exception if Festival's return code is non-zero.
    subprocess.run([festival_exe, '-b', script_path], check=True)
Exemplo n.º 2
0
def process(lab_dir, id_list, out_dir, state_level):
    """Extracts the phone sequence of each label file and writes it, with its length, to text files.

    For every file id, `<out_dir>/phones/<file_id>.txt` receives the phones (one per line) and
    `<out_dir>/n_phones/<file_id>.txt` receives the number of phones.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    phones_dir = os.path.join(out_dir, 'phones')
    n_phones_dir = os.path.join(out_dir, 'n_phones')
    for feat_dir in (phones_dir, n_phones_dir):
        utils.make_dirs(feat_dir, file_ids)

    for file_id in file_ids:
        # Label processing.
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'),
                                  state_level)

        phones = label.phones
        n_phones = len(label.phones)

        txt_name = f'{file_id}.txt'
        file_io.save_lines(phones, os.path.join(phones_dir, txt_name))
        file_io.save_txt(n_phones, os.path.join(n_phones_dir, txt_name))
Exemplo n.º 3
0
    def save_file(self, data, base_name, data_dir):
        r"""Writes a sequence of strings to the text file resolved for `base_name`.

        Parameters
        ----------
        data : list<str>
            Sequence of strings, passed unchanged to `file_io.save_lines`.
        base_name : str
            The name (without extensions) of the file to be saved.
        data_dir : str
            The directory containing all feature types for this dataset; forwarded with `base_name` to
            `self.file_path` to build the output path.
        """
        file_io.save_lines(data, self.file_path(base_name, data_dir))
Exemplo n.º 4
0
    def _full_to_mono(self,
                      full_file_name,
                      mono_file_name,
                      current_phone_regex=re.compile(r'-(.+?)\+')):
        """Converts a full-context label file into a mono-phone label file.

        The current phone of each line is the first capture group of `current_phone_regex`; with the default
        pattern this is the shortest text found between a '-' and the following '+'.

        Args:
            full_file_name (str): Path of the full-context label file to read.
            mono_file_name (str): Path the extracted phones are written to, one per line.
            current_phone_regex (re.Pattern): Compiled pattern whose group 1 captures the current phone.

        Returns:
            list<str>: The extracted phones, in file order.

        Raises:
            ValueError: If a line of the label file does not contain a match for `current_phone_regex`.
        """
        phones = []

        for line_num, line in enumerate(file_io.load_lines(full_file_name),
                                        start=1):
            match = current_phone_regex.search(line)
            if match is None:
                # Fail with the offending location rather than an AttributeError on None.
                raise ValueError(
                    f'No current phone found on line {line_num} of {full_file_name}: {line!r}'
                )
            phones.append(match.group(1))

        file_io.save_lines(phones, mono_file_name)

        return phones
Exemplo n.º 5
0
    def _add_alignments_to_lab(self, mlf, lab_align_dir, lab_dir, file_ids):
        """Merges the state alignments stored in an MLF file into the un-aligned label files.

        The MLF is read sequentially: one header line, then for each file id a file-name line,
        `STATES_PER_PHONE` alignment lines per label line, and a terminating '.' line. For each label line the
        first two fields of every alignment line are written as `<start> <end> <label>[<state>]`, with state
        numbers starting at 2.

        Args:
            mlf (str): Path of the MLF file holding the alignments.
            lab_align_dir (str): Directory the aligned label files are written to.
            lab_dir (str): Directory containing the label files without alignments.
            file_ids: File ids, in the same order as the entries of the MLF.

        Raises:
            ValueError: If the MLF entry order does not match `file_ids`, or an entry is not terminated by '.'.
        """
        make_dirs(lab_align_dir, file_ids)

        with open(mlf, 'r') as mlf_stream:
            # Skip the MLF #!header!# line.
            mlf_stream.readline()

            for file_id in file_ids:
                # The first line of every entry names the file it belongs to.
                name_line = mlf_stream.readline()

                mlf_base_name, _ = os.path.splitext(
                    os.path.basename(name_line))
                id_base_name = os.path.basename(file_id)

                if mlf_base_name != id_base_name:
                    raise ValueError(
                        f'The file order in the mlf ({mlf}) does not match file_ids)\n'
                        f'{mlf_base_name} {id_base_name}')

                lab_name = f'{file_id}.lab'
                unaligned_tags = file_io.load_lines(
                    os.path.join(lab_dir, lab_name))

                aligned_lines = []
                for raw_tag in unaligned_tags:
                    tag = raw_tag.strip()

                    # Each label line owns the next STATES_PER_PHONE alignment lines; states are numbered from 2.
                    for state_num in range(2, STATES_PER_PHONE + 2):
                        fields = mlf_stream.readline().strip().split()
                        start_time, end_time, *_ = fields
                        aligned_lines.append(
                            f'{start_time} {end_time} {tag}[{state_num}]')

                file_io.save_lines(aligned_lines,
                                   os.path.join(lab_align_dir, lab_name))

                # Every entry must end with the '.' marker line.
                if mlf_stream.readline().strip() != '.':
                    raise ValueError('The two files are not matched!')
Exemplo n.º 6
0
    def make_scp(self, file_ids):
        """Builds the wav/lab/mfc path lists and writes the copy and train script files.

        `self.copy_scp` receives one "<wav path> <mfc path>" line per file and `self.train_scp` receives the
        mfc paths.

        Args:
            file_ids: File ids, possibly containing sub-directories.

        Returns:
            tuple: `(wav_paths, lab_paths, mfc_paths)`, three lists in the order of `file_ids`.
        """
        ids = list(file_ids)

        wav_paths = [
            os.path.join(self.wav_dir, f'{file_id}.wav') for file_id in ids
        ]
        lab_paths = [
            os.path.join(self.lab_dir, f'{file_id}.lab') for file_id in ids
        ]
        # HVite requires a flat directory structure, so mfc files use the base_name of file_id.
        mfc_paths = [
            os.path.join(self.mfc_dir, f'{os.path.basename(file_id)}.mfc')
            for file_id in ids
        ]

        copy_lines = (f'{wav} {mfc}' for wav, mfc in zip(wav_paths, mfc_paths))
        file_io.save_lines(copy_lines, self.copy_scp)
        file_io.save_lines(mfc_paths, self.train_scp)

        return wav_paths, lab_paths, mfc_paths
Exemplo n.º 7
0
def sanitise_labs(lab_dir,
                  file_ids,
                  label_out_dir,
                  include_times=False,
                  state_level=False,
                  is_mono=False):
    """Rewrites label files after passing their columns through `sanitise_silences`.

    Each input line is split on whitespace into exactly three fields: start time, end time and label.

    Args:
        lab_dir (str): Directory containing the input `.lab` files.
        file_ids: Ids of the files to process.
        label_out_dir (str): Directory the rewritten `.lab` files are saved to.
        include_times (bool): If True each output line is "<start> <end> <label>", otherwise only the label.
        state_level (bool): If True every label is repeated `STATES_PER_PHONE` times with a `[k]` suffix
            (k starting at 2) and, when `include_times` is set, the times are linearly interpolated per state.
        is_mono (bool): Forwarded to `sanitise_silences`.
    """
    utils.make_dirs(label_out_dir, file_ids)

    for file_id in file_ids:
        lab_name = f'{file_id}.lab'
        lines = file_io.load_lines(os.path.join(lab_dir, lab_name))
        # NOTE(review): the phone count is taken before `sanitise_silences` runs — confirm it keeps the length.
        n_phones = len(lines)

        # Transpose the rows into three column lists.
        start_times, end_times, label = (
            list(column) for column in zip(*(line.split() for line in lines)))
        start_times, end_times, label = sanitise_silences(start_times,
                                                          end_times,
                                                          label,
                                                          is_mono=is_mono)

        if state_level:
            if include_times:
                n_states = n_phones * STATES_PER_PHONE

                # Phone boundaries lie on every STATES_PER_PHONE-th point of the state grid.
                state_grid = range(0, n_states + 1, 1)
                phone_grid = range(0, n_states + 1, STATES_PER_PHONE)
                times = np.interp(state_grid, phone_grid,
                                  start_times + end_times[-1:])

                start_times, end_times = times[:-1], times[1:]

            repeated = np.repeat(label, STATES_PER_PHONE).tolist()
            label = [
                tag + f'[{idx % STATES_PER_PHONE + 2}]'
                for idx, tag in enumerate(repeated)
            ]

        if include_times:
            start_times = [_round_dur(time) for time in start_times]
            end_times = [_round_dur(time) for time in end_times]

            label = [
                ' '.join(fields)
                for fields in zip(start_times, end_times, label)
            ]

        file_io.save_lines(label, os.path.join(label_out_dir, lab_name))
Exemplo n.º 8
0
    def full_to_mono(self, file_ids):
        """Converts all full-context labels to mono-phone labels and writes the phone inventory files.

        Besides the per-file mono labels, this writes the phone list (`self.phonemes`), an identity phone map
        (`self.phoneme_map`, "<phone> <phone>" per line) and an MLF (`self.phoneme_mlf`) that points every
        `*.lab` pattern at `self.mono_lab_dir`.

        Args:
            file_ids: Ids of the label files in `self.lab_dir`, possibly containing sub-directories.
        """
        phone_set = set()

        for file_id in file_ids:
            base_name = os.path.basename(file_id)
            lab_file = os.path.join(self.lab_dir, f'{file_id}.lab')
            # HVite requires a flat directory structure, so mono-lab files use the base_name of file_id.
            mono_lab_file = os.path.join(self.mono_lab_dir, f'{base_name}.lab')

            phones = self._full_to_mono(lab_file, mono_lab_file)
            phone_set.update(phones)

        # Set iteration order varies between runs for strings, so sort to make the output files reproducible.
        phone_list = sorted(phone_set)

        file_io.save_lines(phone_list, self.phonemes)
        file_io.save_lines([f'{phone} {phone}' for phone in phone_list],
                           self.phoneme_map)

        with open(self.phoneme_mlf, 'w') as f:
            f.write('#!MLF!#\n')
            f.write(f'"*/*.lab" => "{self.mono_lab_dir}"\n')
Exemplo n.º 9
0
def process_file(festival_dir, txt_file, out_dir, custom_voice=None):
    """Create Utterance structures for all sentences in `txt_file` and save them to `out_dir`.

    Lines that do not follow the schema are reported with `print` and skipped. The extracted file ids are
    also saved to `<out_dir>/file_id_list.scp`.

    Args:
        festival_dir (str): Directory containing festival installation.
        txt_file (str): File containing all transcriptions, with the following schema,
            (file_id, "sentence transcription")*
        out_dir (str): Directory to save the output to.
        custom_voice (str): Optional Festival voice name, forwarded to `create_utterances`.
    """
    # file_id is matched lazily: a greedy match would absorb trailing whitespace and, for sentences that
    # contain a whitespace-preceded double quote, the leading part of the sentence as well.
    line_regex = re.compile(r'\(\s*'
                            r'(?P<file_id>.+?)'
                            r'\s+'
                            r'"(?P<sentence>.+)"'
                            r'\s*\)')

    file_ids = []
    sentences = []

    # For all lines in txt_file extract file_id + sentence.
    for line in file_io.load_lines(txt_file):

        match = line_regex.match(line)
        if match is None:
            print(f'Match not found for the following line,\n{line}')
            continue

        file_ids.append(match.group('file_id'))

        # Double quotes inside the sentence must be escaped, as it is embedded in a quoted Scheme string.
        sentences.append(match.group('sentence').replace('"', '\\"'))

    # Save the file_ids.
    file_io.save_lines(file_ids, os.path.join(out_dir, 'file_id_list.scp'))

    # Create and save the Utterance structures.
    create_utterances(festival_dir,
                      file_ids,
                      sentences,
                      out_dir,
                      custom_voice=custom_voice)
Exemplo n.º 10
0
def process(lab_dir, wav_dir, id_list, out_dir, state_level, question_file,
            upsample, subphone_feat_type, trim_silences,
            calculate_normalisation, normalisation_of_deltas):
    """Extracts linguistic and acoustic features for the files in id_list and saves them under out_dir.

    For every file id the following are written to sub-directories of `out_dir`: numerical labels (`lab`),
    counter features (`counters`), phone durations (`dur`), phone identities (`phones`), frame and phone
    counts (`n_frames`, `n_phones`), and the vocoder features `lf0`, `vuv`, `mcep` and `bap`.

    Args:
        lab_dir (str): Directory containing the label files.
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        trim_silences (bool): Whether to trim start and end silences from all features.
        calculate_normalisation (bool): Whether to automatically calculate normalisation parameters (min-max
            for `lab` and `counters`, MVN for `dur`, `lf0`, `mcep` and `bap`) after feature extraction.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.

    Raises:
        ValueError: If the labels contain more excess frames, relative to the vocoder analysis, than there
            are phones to remove them from.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    question_set = lab_to_feat.QuestionSet(question_file)
    subphone_feature_set = lab_to_feat.SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames',
                      'n_phones', 'lf0', 'vuv', 'mcep', 'bap'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in tqdm(file_ids):
        # Label processing.
        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = lab_to_feat.Label(lab_path, state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        # Durations are kept as a column vector, one row per phone.
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        # Acoustic processing.
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        wav, sample_rate = file_io.load_wav(wav_path)

        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
        lf0 = np.log(f0)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - f0.shape[0]
        if diff > n_phones:
            raise ValueError(
                f'Number of label frames and vocoder frames is too different for {file_id}\n'
                f'\tlabel frames {n_frames}\n'
                f'\tvocoder frames {f0.shape[0]}\n'
                f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = f0.shape[0]
            print(
                f'Cropped {diff} frames from durations for utterance {file_id}'
            )

        assert n_frames == np.sum(durations).item()

        trim_frame_slice = slice(0, n_frames)
        if trim_silences:

            start_phone_idx, end_phone_idx = 0, n_phones
            start_frame_idx, end_frame_idx = 0, n_frames
            # `durations` is 2-D, so index the scalar explicitly: converting a 1-element array with int()
            # is deprecated in recent NumPy releases.
            if phones[0] in ['sil', '#']:
                start_phone_idx += 1
                start_frame_idx += durations[0, 0]
            if phones[-1] in ['sil', '#']:
                end_phone_idx -= 1
                end_frame_idx -= durations[-1, 0]

            trim_phone_slice = slice(int(start_phone_idx), int(end_phone_idx))
            trim_frame_slice = slice(int(start_frame_idx), int(end_frame_idx))

            # Numerical labels are frame-level only when they were upsampled.
            numerical_labels = numerical_labels[
                trim_frame_slice if upsample else trim_phone_slice]
            durations = durations[trim_phone_slice]
            phones = phones[trim_phone_slice]

            n_frames = trim_frame_slice.stop - trim_frame_slice.start
            n_phones = trim_phone_slice.stop - trim_phone_slice.start

        # Frame-level features are always cut to the (possibly trimmed) frame range.
        counter_features = counter_features[trim_frame_slice]
        lf0 = lf0[trim_frame_slice]
        vuv = vuv[trim_frame_slice]
        mcep = mcep[trim_frame_slice]
        bap = bap[trim_frame_slice]

        file_io.save_bin(numerical_labels.astype(np.float32),
                         os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features.astype(np.float32),
                         os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        file_io.save_txt(durations,
                         os.path.join(out_dir, 'dur', f'{file_id}.txt'))
        file_io.save_lines(phones,
                           os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames,
                         os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones,
                         os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

        file_io.save_bin(lf0.astype(np.float32),
                         os.path.join(out_dir, 'lf0', f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', f'{file_id}.npy'))
        file_io.save_bin(mcep.astype(np.float32),
                         os.path.join(out_dir, 'mcep', f'{file_id}.npy'))
        file_io.save_bin(bap.astype(np.float32),
                         os.path.join(out_dir, 'bap', f'{file_id}.npy'))

    if calculate_normalisation:
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        process_mvn(out_dir,
                    'dur',
                    is_npy=False,
                    id_list=id_list,
                    deltas=False,
                    out_dir=out_dir)

        process_mvn(out_dir,
                    'lf0',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'mcep',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'bap',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
Exemplo n.º 11
0
def process(lab_dir, id_list, out_dir, state_level, question_file, upsample,
            subphone_feat_type, calculate_normalisation):
    """Extracts linguistic features from the label files in id_list and saves them under out_dir.

    For every file id the numerical labels (`lab`), counter features (`counters`), phone durations (`dur`),
    phone identities (`phones`) and the frame and phone counts (`n_frames`, `n_phones`) are written to
    sub-directories of `out_dir`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        calculate_normalisation (bool): Calculate mean-variance and min-max normalisation for duration and labels.
    """
    file_ids = get_file_ids(id_list=id_list)
    question_set = QuestionSet(question_file)
    subphone_feature_set = SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames',
                      'n_phones'):
        make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        label = Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        # Durations are stored as a column vector, one row per phone.
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        npy_name = f'{file_id}.npy'
        txt_name = f'{file_id}.txt'

        file_io.save_bin(numerical_labels,
                         os.path.join(out_dir, 'lab', npy_name))
        file_io.save_bin(counter_features,
                         os.path.join(out_dir, 'counters', npy_name))
        file_io.save_txt(durations,
                         os.path.join(out_dir, 'dur', f'{file_id}.dur'))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', txt_name))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', txt_name))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', txt_name))

    if not calculate_normalisation:
        return

    process_minmax(out_dir, 'lab', id_list)
    process_minmax(out_dir, 'counters', id_list)
    process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False)
Exemplo n.º 12
0
    def train_hmm(self, niter, num_mix, num_splits=1):
        """Performs rounds of HERest re-estimation, doubling the mixture count until `num_mix` is reached.

        For each mixture count (1, 2, 4, ...) `niter` iterations of HERest are run, each writing its models to
        `<model_dir>/hmm_mix_<mix>_iter_<i>`. Between mixture counts HHEd is used to double the number of
        mixtures. `self.cur_dir` is updated to the most recently written model directory after every step.

        Args:
            niter (int): Number of HERest iterations per mixture count.
            num_mix (int): Maximum number of mixtures; doubling stops once the next count would exceed it.
            num_splits (int): If not 1, the training scp is shuffled and split into at most `num_splits` chunks
                (saved as `<cfg_dir>/train_<i>.scp`) that are estimated in parallel and then accumulated.

        Raises:
            ValueError: If `num_splits` is smaller than 1.
            subprocess.CalledProcessError: If any HERest or HHEd call exits with a non-zero return code.
        """
        print('---training HMM models')

        if num_splits != 1:
            if num_splits < 1:
                raise ValueError(
                    f'num_splits must be a positive integer, got {num_splits}')

            # Call HERest in multiple chunks, split scp in num_splits chunks and save them.
            print(f'----num_splits set to {num_splits}')

            train_scp_chunks = []

            mfc_files = file_io.load_lines(self.train_scp)
            random.shuffle(mfc_files)

            # Ceiling division gives at most num_splits chunks; never let the chunk size reach zero.
            n = max(1, -(-len(mfc_files) // num_splits))
            mfc_chunks = [
                mfc_files[j:j + n] for j in range(0, len(mfc_files), n)
            ]

            for i, mfc_chunk in enumerate(mfc_chunks):
                train_scp_chunk = os.path.join(self.cfg_dir, f'train_{i}.scp')
                train_scp_chunks.append(train_scp_chunk)

                file_io.save_lines(mfc_chunk, train_scp_chunk)

        mix = 1
        while mix <= num_mix:
            for i in range(niter):
                next_dir = os.path.join(self.model_dir,
                                        f'hmm_mix_{mix}_iter_{i+1}')
                os.makedirs(next_dir, exist_ok=True)

                # Arguments shared by every HERest call of this iteration: current models and pruning.
                model_args = [
                    '-H',
                    os.path.join(self.cur_dir, MACROS), '-H',
                    os.path.join(self.cur_dir, HMMDEFS), '-t', *PRUNING
                ]

                if num_splits == 1:
                    subprocess.run([
                        self.HERest, '-C', self.cfg, '-S', self.train_scp,
                        '-I', self.phoneme_mlf, '-M', next_dir, *model_args,
                        self.phonemes
                    ],
                                   stdout=subprocess.PIPE,
                                   check=True)
                else:
                    # Estimate per chunk, in parallel. The output is discarded rather than piped: an unread
                    # pipe can fill up and block the child forever while we wait on it.
                    procs = [
                        subprocess.Popen([
                            self.HERest, '-C', self.cfg, '-S', train_scp_chunk,
                            '-I', self.phoneme_mlf, '-M', next_dir,
                            *model_args, '-p',
                            str(chunk_num + 1), self.phonemes
                        ],
                                         stdout=subprocess.DEVNULL)
                        for chunk_num, train_scp_chunk in enumerate(
                            train_scp_chunks)
                    ]

                    # Wait until all HERest calls are finished, then report the first failure (if any).
                    return_codes = [p.wait() for p in procs]
                    for p, return_code in zip(procs, return_codes):
                        if return_code != 0:
                            raise subprocess.CalledProcessError(
                                return_code, p.args)

                    # Now accumulate.
                    subprocess.run([
                        self.HERest, '-C', self.cfg, '-M', next_dir,
                        *model_args, '-p', '0', self.phonemes,
                        *glob.glob(os.path.join(next_dir, '*.acc'))
                    ],
                                   stdout=subprocess.PIPE,
                                   check=True)

                self.cur_dir = next_dir

            if mix * 2 > num_mix:
                # The requested number of mixtures has been reached.
                break

            # Increase mixture number.
            hed_file = os.path.join(self.cfg_dir, f'mix_{mix * 2}.hed')
            with open(hed_file, 'w') as f:
                f.write(
                    f'MU {mix * 2} {{*.state[2-{STATES_PER_PHONE + 2}].mix}}\n'
                )

            next_dir = os.path.join(self.model_dir,
                                    f'hmm_mix_{mix * 2}_iter_0')
            os.makedirs(next_dir, exist_ok=True)

            subprocess.run([
                self.HHEd, '-A', '-H',
                os.path.join(self.cur_dir, MACROS), '-H',
                os.path.join(self.cur_dir, HMMDEFS), '-M', next_dir,
                hed_file, self.phonemes
            ],
                           check=True)

            self.cur_dir = next_dir
            mix *= 2