def process_dir(festival_dir, txt_dir, id_list, out_dir, custom_voice=None):
    """Create Utterance structures for all sentences in `id_list` and save them to `out_dir`.

    Args:
        festival_dir (str): Directory containing festival installation.
        txt_dir (str): Directory containing text transcriptions.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        custom_voice (str): Optional voice name, forwarded unchanged to `create_utterances`.
    """
    # Sort once and reuse the same ordering everywhere. `create_utterances` pairs ids with sentences positionally, so
    # loading sentences in sorted order while passing on unsorted ids would attach text to the wrong file.
    file_ids = sorted(utils.get_file_ids(id_list=id_list))

    sentences = []
    for file_id in file_ids:
        # Only the first line of each transcription file is used.
        sentence = file_io.load_lines(os.path.join(txt_dir, f'{file_id}.txt'))[0]
        # Escape double quotes, since the sentence is later embedded in a double-quoted Scheme string.
        sentences.append(sentence.replace('"', '\\"'))

    # If the file_ids are paths (e.g. for multi-speaker data), make sure the directory structure is already in place.
    utils.make_dirs(os.path.join(out_dir, 'utts'), file_ids)

    # Create and save the Utterance structures.
    create_utterances(festival_dir, file_ids, sentences, out_dir, custom_voice=custom_voice)
def plot_repeated_batch_f0(features, *repeated_predictions, out_dir=None):
    """Plot the target F0 of every batch item together with each of the given predictions.

    Args:
        features (dict): Batch of targets; the keys 'name', 'n_frames', 'lf0' and 'vuv' are read.
        *repeated_predictions (dict): Any number of predictions; the key 'lf0' is read from each.
        out_dir (str): If given, plots are written to `<out_dir>/plots/repeated_f0/<name>.pdf`. Otherwise
            `plot_repeated_f0` is called with `out_file=None`.
    """
    save_plots = out_dir is not None
    if save_plots:
        plots_dir = os.path.join(out_dir, 'plots', 'repeated_f0')
        make_dirs(plots_dir, features['name'])

    n_frames = features['n_frames'].cpu().detach().numpy()

    # Log-F0 is exponentiated before the batch is split into per-item sequences.
    target_f0, target_vuv = utils.detach_batched_seqs(
        torch.exp(features['lf0']), features['vuv'], seq_len=n_frames)

    exp_predictions = (torch.exp(predicted['lf0']) for predicted in repeated_predictions)
    repeated_pred_f0 = utils.detach_batched_seqs(*exp_predictions, seq_len=n_frames)

    for i, name in enumerate(features['name']):
        out_file = os.path.join(plots_dir, f'{name}.pdf') if save_plots else None
        item_predictions = (pred_f0[i] for pred_f0 in repeated_pred_f0)
        plot_repeated_f0(target_f0[i], target_vuv[i], *item_predictions, name=name, out_file=out_file)
def process(wav_dir, id_list, out_dir, calculate_normalisation, normalisation_of_deltas):
    """Extracts log-F0 and V/UV from the wav files in id_list, and optionally computes MVN parameters for log-F0.

    Args:
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to; features go to the `lf0` and `vuv` sub-directories.
        calculate_normalisation (bool): Whether to automatically calculate MVN parameters after extracting F0.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.
    """
    file_ids = get_file_ids(wav_dir, id_list)

    lf0_dir = os.path.join(out_dir, 'lf0')
    vuv_dir = os.path.join(out_dir, 'vuv')
    make_dirs(lf0_dir, file_ids)
    make_dirs(vuv_dir, file_ids)

    for file_id in file_ids:
        wav, sample_rate = file_io.load_wav(os.path.join(wav_dir, f'{file_id}.wav'))
        f0, vuv = analysis(wav, sample_rate)

        file_io.save_bin(np.log(f0), os.path.join(lf0_dir, f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(vuv_dir, f'{file_id}.npy'))

    if calculate_normalisation:
        process_mvn(out_dir, 'lf0', id_list=id_list, deltas=normalisation_of_deltas)
def create_utterances(festival_dir, file_ids, sentences, out_dir, custom_voice=None):
    """Write a Festival script that saves one Utterance per sentence, then run it in batch mode.

    Args:
        festival_dir (str): Directory containing festival installation; `bin/festival` inside it is executed.
        file_ids (list<str>): Basenames of the `.utt` files, paired positionally with `sentences`.
        sentences (list<str>): Text to synthesise, inserted verbatim into a double-quoted Scheme string.
        out_dir (str): Directory that receives `gen_utts.scm` and the `utts` sub-directory.
        custom_voice (str): If given, a `(voice_<custom_voice>)` command is placed before the synthesis commands.

    Raises:
        subprocess.CalledProcessError: If festival exits with a non-zero return code.
    """
    festival_exe = os.path.join(festival_dir, 'bin', 'festival')
    utt_dir = os.path.join(out_dir, 'utts')

    script_lines = [f'#!{festival_exe}']
    if custom_voice is not None:
        # Run Festival with a particular voice.
        script_lines.append(f'(voice_{custom_voice})')

    script_lines.extend(
        '(utt.save (utt.synth (Utterance Text "{sentence}" )) "{utt_file}")'.format(
            sentence=sentence, utt_file=os.path.join(utt_dir, f'{file_id}.utt'))
        for file_id, sentence in zip(file_ids, sentences))

    # Save the commands.
    script_path = os.path.join(out_dir, 'gen_utts.scm')
    file_io.save_lines(script_lines, script_path)

    # If the file_ids are paths (e.g. for multi-speaker data), make sure the directory structure is already in place.
    utils.make_dirs(utt_dir, file_ids)

    # `check=True` makes a non-zero return code raise an exception.
    subprocess.run([festival_exe, '-b', script_path], check=True)
def process(lab_dir, id_list, out_dir, state_level):
    """Saves the phone identities and the number of phones of each label file in id_list to text files.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to; uses the `phones` and `n_phones` sub-directories.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    phones_dir = os.path.join(out_dir, 'phones')
    n_phones_dir = os.path.join(out_dir, 'n_phones')
    utils.make_dirs(phones_dir, file_ids)
    utils.make_dirs(n_phones_dir, file_ids)

    for file_id in file_ids:
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        phones = label.phones
        n_phones = len(label.phones)

        file_io.save_lines(phones, os.path.join(phones_dir, f'{file_id}.txt'))
        file_io.save_txt(n_phones, os.path.join(n_phones_dir, f'{file_id}.txt'))
def plot_batch_f0(features, predicted, use_vuv=None, out_dir=None):
    """Plot target against predicted F0 for every item in a batch.

    Args:
        features (dict): Batch of targets; the keys 'name', 'n_frames', 'lf0' and 'vuv' are read.
        predicted (dict): Batch of predictions; 'lf0' is read, and 'vuv' when `use_vuv` is enabled.
        use_vuv (bool): Whether to pass predicted V/UV to `plot_f0`. Defaults to whether 'vuv' is in `predicted`.
        out_dir (str): If given, plots are written to `<out_dir>/plots/f0/<name>.pdf`. Otherwise `plot_f0` is
            called with `out_file=None`.
    """
    if use_vuv is None:
        use_vuv = 'vuv' in predicted

    save_plots = out_dir is not None
    if save_plots:
        plots_dir = os.path.join(out_dir, 'plots', 'f0')
        make_dirs(plots_dir, features['name'])

    n_frames = features['n_frames'].cpu().detach().numpy()

    target_f0, target_vuv = utils.detach_batched_seqs(
        torch.exp(features['lf0']), features['vuv'], seq_len=n_frames)
    pred_f0 = utils.detach_batched_seqs(torch.exp(predicted['lf0']), seq_len=n_frames)

    if not use_vuv:
        # One placeholder per item, so that the indexing below works either way.
        pred_vuv = [None] * len(pred_f0)
    else:
        # The predicted V/UV is thresholded at 0.5 to get a binary decision.
        pred_vuv = utils.detach_batched_seqs(predicted['vuv'] > 0.5, seq_len=n_frames)

    for i, name in enumerate(features['name']):
        out_file = os.path.join(plots_dir, f'{name}.pdf') if save_plots else None
        plot_f0(target_f0[i], target_vuv[i], pred_f0[i], pred_vuv[i], name=name, out_file=out_file)
def save_dir(save_fn, path, data, file_ids, feat_ext=None):
    """Save each item of `data` under `path` using `save_fn`, with one file per entry of `file_ids`.

    Args:
        save_fn (callable): Called as `save_fn(datum, file_path)` for every item.
        path (str): Directory to save the files to.
        data (iterable): Items to save, paired positionally with `file_ids`.
        file_ids (list<str>): File names relative to `path`.
        feat_ext (str): If given, `.<feat_ext>` is appended to each file id.
    """
    utils.make_dirs(path, file_ids)

    for datum, file_id in zip(data, file_ids):
        file_name = file_id if feat_ext is None else f'{file_id}.{feat_ext}'
        save_fn(datum, os.path.join(path, file_name))
def cluster(embeddings, n_clusters, names=None, out_dir=None):
    """Clusters embeddings with k-means, optionally saving the centres, assignments and cluster sizes to files.

    Args:
        embeddings: Data passed directly to `KMeans.fit` (presumably one row per sentence; confirm with the caller).
        n_clusters (int): Number of clusters for k-means.
        names (str): List of file basenames, one per embedding. Required when `out_dir` is given.
        out_dir (str): Directory to save the output to. Nothing is saved if this is None.

    Returns:
        (cluster_centres, cluster_assignments): The `cluster_centers_` and `labels_` of the fitted k-means model.

    Raises:
        ValueError: If `out_dir` is given without `names`.
    """
    if out_dir is not None:
        if names is None:
            raise ValueError(
                'If `out_dir` is given, then `names` of individual sentences must also be given'
            )

        centres_path = os.path.join(out_dir, 'k_means', 'clusters')
        make_dirs(centres_path, names)

        assignments_path = os.path.join(out_dir, 'k_means', 'cluster_assignments')
        make_dirs(assignments_path, names)

    # Cluster with k-means.
    kmeans = KMeans(n_clusters=n_clusters).fit(embeddings)
    cluster_centres = kmeans.cluster_centers_
    cluster_assignments = kmeans.labels_

    # Save the cluster assignments and clusters to files.
    if out_dir is not None:
        cluster_names = [f'cluster_{i}' for i in range(n_clusters)]
        file_io.save_dir(file_io.save_bin, centres_path, cluster_centres, cluster_names, feat_ext='npy')
        file_io.save_dir(file_io.save_txt, assignments_path, cluster_assignments, names, feat_ext='txt')

        # Rows of (cluster index, number of members). `minlength` keeps an entry for clusters with no members.
        cluster_sizes = np.bincount(cluster_assignments.reshape(-1), minlength=n_clusters)
        counts = np.stack((np.arange(n_clusters), cluster_sizes), axis=1)
        file_io.save_txt(counts, f'{assignments_path}_counts.txt')

    return cluster_centres, cluster_assignments
def _add_alignments_to_lab(self, mlf, lab_align_dir, lab_dir, file_ids):
    """Merge the state alignments listed in an MLF file into the un-aligned label files.

    The MLF is read sequentially: a header line, then for each file a file name line, `STATES_PER_PHONE`
    alignment lines per label, and a closing '.' line. The entries must be in the same order as `file_ids`.

    Args:
        mlf (str): Path of the MLF file containing the alignments.
        lab_align_dir (str): Directory to save the aligned label files to.
        lab_dir (str): Directory containing the label files without alignments.
        file_ids (list<str>): Basenames of the files to process.

    Raises:
        ValueError: If a file name in the MLF does not match the expected file id, or if a file's entry is not
            terminated by '.' where expected.
    """
    make_dirs(lab_align_dir, file_ids)

    with open(mlf, 'r') as mlf_file:
        # Consume the MLF #!header!# line.
        mlf_file.readline()

        for file_id in file_ids:
            # Consume the file name line.
            name_line = mlf_file.readline()
            mlf_base_name = os.path.splitext(os.path.basename(name_line))[0]
            id_base_name = os.path.basename(file_id)

            if mlf_base_name != id_base_name:
                raise ValueError(
                    f'The file order in the mlf ({mlf}) does not match file_ids)\n'
                    f'{mlf_base_name} {id_base_name}')

            label_no_align = file_io.load_lines(os.path.join(lab_dir, f'{file_id}.lab'))

            label_state_align = []
            for label_tag in (tag.strip() for tag in label_no_align):
                for state_idx in range(STATES_PER_PHONE):
                    # Each state alignment line starts with the start and end times; the rest is ignored.
                    start_time, end_time, *_ = mlf_file.readline().strip().split()
                    label_state_align.append(f'{start_time} {end_time} {label_tag}[{state_idx + 2}]')

            file_io.save_lines(label_state_align, os.path.join(lab_align_dir, f'{file_id}.lab'))

            # Consume the end of file line marker ('.' character).
            if mlf_file.readline().strip() != '.':
                raise ValueError('The two files are not matched!')
def sanitise_labs(lab_dir, file_ids, label_out_dir, include_times=False, state_level=False, is_mono=False):
    """Rewrite label files after passing them through `sanitise_silences`, optionally expanded to state level.

    Args:
        lab_dir (str): Directory containing the input label files; each line is split into start, end and label.
        file_ids (list<str>): Basenames of the files to process.
        label_out_dir (str): Directory to save the sanitised label files to.
        include_times (bool): Whether to write the start and end times in front of each label.
        state_level (bool): Whether to repeat each label `STATES_PER_PHONE` times with a `[k]` state suffix, where
            k starts at 2.
        is_mono (bool): Forwarded to `sanitise_silences`.
    """
    utils.make_dirs(label_out_dir, file_ids)

    for file_id in file_ids:
        lines = file_io.load_lines(os.path.join(lab_dir, f'{file_id}.lab'))
        # The phone count is taken before `sanitise_silences` is applied.
        n_phones = len(lines)

        start_times, end_times, label = map(list, zip(*map(str.split, lines)))
        start_times, end_times, label = sanitise_silences(start_times, end_times, label, is_mono=is_mono)

        if state_level:
            if include_times:
                # Interpolate state boundaries linearly between the phone boundaries.
                n_states = n_phones * STATES_PER_PHONE
                state_boundaries = range(0, n_states + 1, 1)
                phone_boundaries = range(0, n_states + 1, STATES_PER_PHONE)
                times = np.interp(state_boundaries, phone_boundaries, start_times + end_times[-1:])
                start_times = times[:-1]
                end_times = times[1:]

            repeated = np.repeat(label, STATES_PER_PHONE).tolist()
            label = [f'{tag}[{idx % STATES_PER_PHONE + 2}]' for idx, tag in enumerate(repeated)]

        if include_times:
            start_times = [_round_dur(time) for time in start_times]
            end_times = [_round_dur(time) for time in end_times]
            label = [' '.join(fields) for fields in zip(start_times, end_times, label)]

        file_io.save_lines(label, os.path.join(label_out_dir, f'{file_id}.lab'))
def batch_synth(lf0, vuv, mcep, bap, seq_len=None, names=None, out_dir=None, sample_rate=16000):
    """Synthesise a waveform for every item in a batch of vocoder features, optionally saving them as wav files.

    Args:
        lf0: Batched log-F0; exponentiated per item before synthesis.
        vuv: Batched V/UV feature.
        mcep: Batched mcep feature.
        bap: Batched bap feature.
        seq_len: Sequence lengths, forwarded to `utils.detach_batched_seqs`.
        names (list<str>): Name of each item. Required when `out_dir` is given. If omitted, every item in the
            batch is synthesised.
        out_dir (str): If given, waveforms are saved to `<out_dir>/synth/<name>.wav`.
        sample_rate (int): Sample rate used for synthesis and for the saved wav files.

    Returns:
        (list): The synthesised waveforms, in batch order.

    Raises:
        ValueError: If `out_dir` is given without `names`.
    """
    if out_dir is not None:
        if names is None:
            raise ValueError(
                'If `out_dir` is given, then `names` of individual sentences must also be given'
            )

        synth_dir = os.path.join(out_dir, 'synth')
        make_dirs(synth_dir, names)

    lf0, vuv, mcep, bap = utils.detach_batched_seqs(lf0, vuv, mcep, bap, seq_len=seq_len)

    # `names` is optional when nothing is saved, so it cannot be relied on to drive the loop.
    n_items = len(lf0) if names is None else len(names)

    wavs = []
    for i in range(n_items):
        f0_i = np.exp(lf0[i])
        # Smooth F0 with a 7-frame filter; sequences shorter than the window are left as they are.
        f0_i = savgol_filter(f0_i, 7, 1) if len(f0_i) >= 7 else f0_i

        wav = world.synthesis(f0_i, vuv[i], mcep[i], bap[i], sample_rate=sample_rate)
        wavs.append(wav)

        if out_dir is not None:
            wav_path = os.path.join(synth_dir, f'{names[i]}.wav')
            file_io.save_wav(wav, wav_path, sample_rate=sample_rate)

    return wavs
def dumps_to_labs(dump_dir, file_ids, label_out_dir, awk='label-full.awk'):
    """Convert feature dumps to label files by running a gawk script on each dump.

    Args:
        dump_dir (str): Directory containing the `<file_id>.txt` dumps.
        file_ids (list<str>): Basenames of the files to process.
        label_out_dir (str): Directory to save the `<file_id>.lab` files to.
        awk (str): Script passed to `gawk -f`. If the name matches a file in the `resources/festival` directory of
            the `tts_data_tools` package, that packaged file is used.

    Raises:
        subprocess.CalledProcessError: If gawk exits with a non-zero return code.
    """
    festival_resources = os.path.join('resources', 'festival')
    if awk in pkg_resources.resource_listdir('tts_data_tools', festival_resources):
        print(f'Using tts_data_tools resource from resources/festival for {awk}')
        awk = pkg_resources.resource_filename('tts_data_tools', os.path.join(festival_resources, awk))

    utils.make_dirs(label_out_dir, file_ids)

    for file_id in file_ids:
        dump_path = os.path.join(dump_dir, f'{file_id}.txt')
        # `check=True` makes a non-zero return code raise an exception.
        result = subprocess.run(['gawk', '-f', awk, dump_path], check=True, stdout=subprocess.PIPE)

        # The captured `stdout` is a binary string, so the label file is written in binary mode.
        lab_path = os.path.join(label_out_dir, f'{file_id}.lab')
        with open(lab_path, 'wb') as lab_file:
            lab_file.write(result.stdout)
def save_files(self, data, base_names, data_dir):
    """Save every item of `data` with `self.save_file`, after preparing the directories under `data_dir`.

    Args:
        data (iterable): Items to save, paired positionally with `base_names`.
        base_names (list<str>): Base name for each item.
        data_dir (str): Directory passed to `self.save_file` for every item.
    """
    utils.make_dirs(data_dir, base_names)

    for item, item_name in zip(data, base_names):
        self.save_file(item, item_name, data_dir)
def process(lab_dir, id_list, out_dir, state_level, lab_dir_with_pos, wav_dir):
    """Splits each utterance in id_list into segments, saves the segment lengths in phones and in frames.

    Files for which `segment_words` raises a ValueError or IndexError are reported with `print` and skipped.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to; uses the `segment_n_phones`, `segment_n_frames` and
            `n_segments` sub-directories.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        lab_dir_with_pos (str): Directory containing the label files from which the POS tags are extracted.
        wav_dir (str): Directory containing the wav files, used to get the number of vocoder frames.

    Raises:
        ValueError: If the labels have more frames than the vocoder features by more than the number of phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    for feat_name in ('segment_n_phones', 'segment_n_frames', 'n_segments'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        label_with_pos = file_io.load_lines(os.path.join(lab_dir_with_pos, f'{file_id}.lab'))
        word_start_idxs, _ = get_word_idxs(
            label_with_pos, word_idx_sep=(r'@', r'\+'), phrase_idx_sep=(r'@', r'='))
        pos_tags = get_pos_tags(label_with_pos, word_start_idxs)

        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        durations = label.phone_durations
        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        word_start_idxs, word_end_idxs = get_word_idxs(
            label.labels, word_idx_sep=(r':', r'\+'), phrase_idx_sep=(r':', r'='))

        try:
            segment_start_idxs, segment_end_idxs = segment_words(word_start_idxs, word_end_idxs, pos_tags)
        except (ValueError, IndexError) as err:
            print(f'{err}\n{file_id}')
            continue

        wav, sample_rate = file_io.load_wav(os.path.join(wav_dir, f'{file_id}.wav'))
        f0, _, _, _ = world_with_reaper_f0.analysis(wav, sample_rate)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        n_vocoder_frames = f0.shape[0]
        diff = n_frames - n_vocoder_frames
        if diff > n_phones:
            raise ValueError(f'Number of label frames and vocoder frames is too different for {file_id}\n'
                             f'\tlabel frames {n_frames}\n'
                             f'\tvocoder frames {f0.shape[0]}\n'
                             f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = n_vocoder_frames
            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        segment_bounds = list(zip(segment_start_idxs, segment_end_idxs))
        segment_phone_lens = [end_idx - start_idx for start_idx, end_idx in segment_bounds]
        segment_frame_lens = [sum(durations[start_idx:end_idx]) for start_idx, end_idx in segment_bounds]

        file_io.save_txt(segment_phone_lens, os.path.join(out_dir, 'segment_n_phones', f'{file_id}.txt'))
        file_io.save_txt(segment_frame_lens, os.path.join(out_dir, 'segment_n_frames', f'{file_id}.txt'))
        file_io.save_txt(len(segment_phone_lens), os.path.join(out_dir, 'n_segments', f'{file_id}.txt'))
def utts_to_dumps(dumpfeats_exe, utt_dir, file_ids, dump_dir, feature_level='Segment',
                  extra_feats_scm='extra_feats.scm', label_feats='label.feats', custom_voice=None):
    """Dump features from Utterance files with `dumpfeats`, then replace '#' with 'pau' in the dumps.

    Args:
        dumpfeats_exe (str): Path of the `dumpfeats` executable.
        utt_dir (str): Directory containing the `<file_id>.utt` files.
        file_ids (list<str>): Basenames of the files to process.
        dump_dir (str): Directory to save the `<file_id>.txt` dumps to.
        feature_level (str): Value passed to the `-relation` option of `dumpfeats`.
        extra_feats_scm (str): Scheme file passed to the `-eval` option. If the name matches a file in the
            `resources/festival` directory of the `tts_data_tools` package, that packaged file is used.
        label_feats (str): File passed to the `-feats` option, resolved in the same way as `extra_feats_scm`.
        custom_voice (str): If given, `(voice_<custom_voice>)` is placed in front of the Scheme code, using a
            temporary copy of `extra_feats_scm`.

    Raises:
        subprocess.CalledProcessError: If `dumpfeats` or `sed` exits with a non-zero return code.
    """
    festival_resources = os.path.join('resources', 'festival')
    packaged_files = pkg_resources.resource_listdir('tts_data_tools', festival_resources)

    if extra_feats_scm in packaged_files:
        print(f'Using tts_data_tools resource from resources/festival for {extra_feats_scm}')
        extra_feats_scm = pkg_resources.resource_filename(
            'tts_data_tools', os.path.join(festival_resources, extra_feats_scm))

    if label_feats in packaged_files:
        print(f'Using tts_data_tools resource from resources/festival for {label_feats}')
        label_feats = pkg_resources.resource_filename(
            'tts_data_tools', os.path.join(festival_resources, label_feats))

    voice_scm_file = None
    try:
        if custom_voice is not None:
            # Text mode is required: NamedTemporaryFile defaults to 'w+b', which rejects the str written below.
            voice_scm_file = tempfile.NamedTemporaryFile(mode='w', suffix='.scm')

            # Write an initial line to load the custom voice, followed by the original Scheme code.
            voice_scm_file.write(f'(voice_{custom_voice})\n')
            with open(extra_feats_scm, 'r') as f:
                voice_scm_file.write(f.read())

            # Flush so that the subprocess sees the full contents when it opens the file by name.
            voice_scm_file.flush()

            # Replace the file name with the name of the temporary file.
            extra_feats_scm = voice_scm_file.name

        utils.make_dirs(dump_dir, file_ids)

        dump_paths = []
        for file_id in file_ids:
            dump_path = os.path.join(dump_dir, f'{file_id}.txt')

            # `check=True` makes a non-zero return code raise an exception.
            subprocess.run([
                dumpfeats_exe,
                '-eval', extra_feats_scm,
                '-relation', feature_level,
                '-feats', label_feats,
                '-output', dump_path,
                os.path.join(utt_dir, f'{file_id}.utt')
            ], check=True)
            dump_paths.append(dump_path)

        # Replace any '#' characters used for pauses with 'pau', in the dumps that were just written.
        # sed fails when given no input files, so it is skipped if there is nothing to process.
        if dump_paths:
            subprocess.run(['sed', '-i', '-e', 's/#/pau/g', *dump_paths], check=True)
    finally:
        # Closing the temporary file deletes it; this must happen even if a subprocess failed.
        if voice_scm_file is not None:
            voice_scm_file.close()
def process(lab_dir, wav_dir, id_list, out_dir, state_level, question_file, upsample, subphone_feat_type,
            trim_silences, calculate_normalisation, normalisation_of_deltas):
    """Extracts linguistic features from label files and vocoder features from wav files, for all of id_list.

    For each file the numerical labels, counter features, durations, phones, frame and phone counts, log-F0,
    V/UV, mcep and bap are saved to their own sub-directory of `out_dir`.

    Args:
        lab_dir (str): Directory containing the label files.
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        trim_silences (bool): Whether to trim start and end silences from all features.
        calculate_normalisation (bool): Whether to automatically calculate normalisation parameters at the end.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.

    Raises:
        ValueError: If the labels have more frames than the vocoder features by more than the number of phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)
    question_set = lab_to_feat.QuestionSet(question_file)
    subphone_feature_set = lab_to_feat.SubphoneFeatureSet(subphone_feat_type)

    feat_names = ('lab', 'counters', 'dur', 'phones', 'n_frames', 'n_phones', 'lf0', 'vuv', 'mcep', 'bap')
    for feat_name in feat_names:
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in tqdm(file_ids):
        # Label processing.
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        numerical_labels = label.extract_numerical_labels(question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        # Acoustic processing.
        wav, sample_rate = file_io.load_wav(os.path.join(wav_dir, f'{file_id}.wav'))
        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
        lf0 = np.log(f0)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        n_vocoder_frames = f0.shape[0]
        diff = n_frames - n_vocoder_frames
        if diff > n_phones:
            raise ValueError(
                f'Number of label frames and vocoder frames is too different for {file_id}\n'
                f'\tlabel frames {n_frames}\n'
                f'\tvocoder frames {f0.shape[0]}\n'
                f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = n_vocoder_frames
            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        # Without silence trimming the frame-level features are still cropped to the number of label frames.
        trim_frame_slice = slice(0, n_frames)

        if trim_silences:
            first_phone, last_phone = 0, n_phones
            first_frame, last_frame = 0, n_frames

            if phones[0] in ('sil', '#'):
                first_phone += 1
                first_frame += durations[0]

            if phones[-1] in ('sil', '#'):
                last_phone -= 1
                last_frame -= durations[-1]

            trim_phone_slice = slice(int(first_phone), int(last_phone))
            trim_frame_slice = slice(int(first_frame), int(last_frame))

            # The numerical labels are at frame level only if they were upsampled.
            numerical_labels = numerical_labels[trim_frame_slice if upsample else trim_phone_slice]
            durations = durations[trim_phone_slice]
            phones = phones[trim_phone_slice]

            n_frames = trim_frame_slice.stop - trim_frame_slice.start
            n_phones = trim_phone_slice.stop - trim_phone_slice.start

        counter_features = counter_features[trim_frame_slice]
        lf0 = lf0[trim_frame_slice]
        vuv = vuv[trim_frame_slice]
        mcep = mcep[trim_frame_slice]
        bap = bap[trim_frame_slice]

        npy_name = f'{file_id}.npy'
        txt_name = f'{file_id}.txt'

        file_io.save_bin(numerical_labels.astype(np.float32), os.path.join(out_dir, 'lab', npy_name))
        file_io.save_bin(counter_features.astype(np.float32), os.path.join(out_dir, 'counters', npy_name))
        file_io.save_txt(durations, os.path.join(out_dir, 'dur', txt_name))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', txt_name))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', txt_name))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', txt_name))

        file_io.save_bin(lf0.astype(np.float32), os.path.join(out_dir, 'lf0', npy_name))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', npy_name))
        file_io.save_bin(mcep.astype(np.float32), os.path.join(out_dir, 'mcep', npy_name))
        file_io.save_bin(bap.astype(np.float32), os.path.join(out_dir, 'bap', npy_name))

    if calculate_normalisation:
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False, out_dir=out_dir)

        for feat_name in ('lf0', 'mcep', 'bap'):
            process_mvn(out_dir, feat_name, id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
def process(lab_dir, id_list, out_dir, state_level, question_file, upsample, subphone_feat_type,
            calculate_normalisation):
    """Extracts linguistic features from the label files in id_list and saves them to file.

    For each file the numerical labels, counter features, durations, phones, and the frame and phone counts are
    saved to their own sub-directory of `out_dir`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        calculate_normalisation (bool): Calculate mean-variance and min-max normalisation for duration and labels.
    """
    file_ids = get_file_ids(id_list=id_list)
    question_set = QuestionSet(question_file)
    subphone_feature_set = SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames', 'n_phones'):
        make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        label = Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        numerical_labels = label.extract_numerical_labels(question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        file_io.save_bin(numerical_labels, os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features, os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        # Durations are written with a `.dur` extension, the other text outputs with `.txt`.
        file_io.save_txt(durations, os.path.join(out_dir, 'dur', f'{file_id}.dur'))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

    if calculate_normalisation:
        for feat_name in ('lab', 'counters'):
            process_minmax(out_dir, feat_name, id_list)
        process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False)