示例#1
0
def process(lab_dir, id_list, out_dir, state_level):
    """Writes the phone sequence and the phone count of every utterance to text files.

    For each file id, `<lab_dir>/<file_id>.lab` is loaded as a `lab_to_feat.Label`. Its `phones` are written with
    `file_io.save_lines` to `<out_dir>/phones/<file_id>.txt`, and the number of phones is written with
    `file_io.save_txt` to `<out_dir>/n_phones/<file_id>.txt`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process, resolved by `utils.get_file_ids`.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    phones_dir = os.path.join(out_dir, 'phones')
    utils.make_dirs(phones_dir, file_ids)
    n_phones_dir = os.path.join(out_dir, 'n_phones')
    utils.make_dirs(n_phones_dir, file_ids)

    for file_id in file_ids:
        out_name = f'{file_id}.txt'

        # Parse the label file for this utterance.
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'),
                                  state_level)
        phone_sequence = label.phones
        phone_count = len(label.phones)

        file_io.save_lines(phone_sequence, os.path.join(phones_dir, out_name))
        file_io.save_txt(phone_count, os.path.join(n_phones_dir, out_name))
示例#2
0
    def save_file(self, data, base_name, data_dir):
        r"""Writes `data` to a text file using `file_io.save_txt`.

        The destination is whatever `self.file_path(base_name, data_dir)` returns.

        Parameters
        ----------
        data : int or float or bool or `np.ndarray`, shape (seq_len, feat_dim)
            Data to be written to the file.
        base_name : str
            The name (without extensions) of the file to be written.
        data_dir : str
            The directory containing all feature types for this dataset.
        """
        destination = self.file_path(base_name, data_dir)
        file_io.save_txt(data, destination)
示例#3
0
def cluster(embeddings, n_clusters, names=None, out_dir=None):
    """Clusters embeddings with k-means, optionally saving the centres, assignments, and cluster sizes.

    When `out_dir` is given, three outputs are written below `<out_dir>/k_means`:
        clusters/                       one binary file per cluster centre, named `cluster_<i>`.
        cluster_assignments/            one text file per entry in `names`, holding its cluster index.
        cluster_assignments_counts.txt  rows of `(cluster index, number of members)`.

    Args:
        embeddings (array-like): Embeddings to cluster, passed unchanged to `KMeans.fit`.
            Presumably one row per entry in `names` -- confirm against callers.
        n_clusters (int): Number of clusters for k-means.
        names (list<str>): Basenames used to name the saved assignments. Required when `out_dir` is given.
        out_dir (str): Directory to save the output to. Nothing is written to disk if None.

    Returns:
        (tuple): `(cluster_centres, cluster_assignments)`, taken from the fitted model's `cluster_centers_`
            and `labels_` attributes.

    Raises:
        ValueError: If `out_dir` is given but `names` is not.
    """
    if out_dir is not None:
        if names is None:
            raise ValueError(
                'If `out_dir` is given, then `names` of individual sentences must also be given'
            )

        # Create the output directories before the (potentially slow) clustering, so path problems surface early.
        centres_path = os.path.join(out_dir, 'k_means', 'clusters')
        make_dirs(centres_path, names)

        assignments_path = os.path.join(out_dir, 'k_means',
                                        'cluster_assignments')
        make_dirs(assignments_path, names)

    # Cluster with k-means.
    kmeans = KMeans(n_clusters=n_clusters).fit(embeddings)
    cluster_centres = kmeans.cluster_centers_
    cluster_assignments = kmeans.labels_

    # Save the cluster assignments and clusters to files.
    if out_dir is not None:
        cluster_names = [f'cluster_{i}' for i in range(n_clusters)]
        file_io.save_dir(file_io.save_bin,
                         centres_path,
                         cluster_centres,
                         cluster_names,
                         feat_ext='npy')
        file_io.save_dir(file_io.save_txt,
                         assignments_path,
                         cluster_assignments,
                         names,
                         feat_ext='txt')

        # Count the members of every cluster in a single pass over the assignments.
        # `minlength` guarantees one row per cluster, with an explicit zero for any empty cluster.
        cluster_sizes = np.bincount(cluster_assignments.reshape(-1),
                                    minlength=n_clusters)
        counts = np.stack((np.arange(n_clusters), cluster_sizes), axis=1)
        file_io.save_txt(counts, f'{assignments_path}_counts.txt')

    return cluster_centres, cluster_assignments
示例#4
0
    def analysis_for_train_epoch(self, out_dir, **kwargs):
        """Saves the prior means, plots the embedding space, and saves a class assignment per segment.

        Outputs written below `out_dir`:
            feats/prior/                 prior means, one file per pseudo input name.
            scatter_<proj>_mean_F0.pdf   scatter plots of the embeddings for `proj` in PCA and tSNE.
            feats/classes/               class index of each segment, one text file per segment.
            feats/class_counts.txt       rows of `(class index, number of segments assigned)`.

        Args:
            out_dir (str): Directory to save the analysis to.
            **kwargs: Accepted but not used by this method.
        """
        feats_dir = os.path.join(out_dir, 'feats')
        os.makedirs(feats_dir, exist_ok=True)

        def metric_as_numpy(metric_name):
            # Fetches an accumulated metric and converts it to a NumPy array.
            return self.metrics.metrics[metric_name].result().detach().cpu(
            ).numpy()

        # Run the pseudo inputs through the encoder, its outputs parameterise the prior.
        prior_mean, prior_log_variance = self.encoder_layer(
            self.pseudo_inputs, seq_len=self.pseudo_inputs_seq_lens)
        prior_mean = prior_mean.cpu().detach().numpy()
        prior_log_variance = prior_log_variance.cpu().detach().numpy()

        file_io.save_dir(file_io.save_bin,
                         path=os.path.join(feats_dir, 'prior'),
                         data=prior_mean,
                         file_ids=self.pseudo_input_names)

        embeddings = metric_as_numpy('embeddings')
        names = self.metrics.metrics['name'].result()

        # Names are per sentence, expand them to one name per segment (`<name>_<segment index>`) for the scatter plot.
        n_segments = metric_as_numpy('n_segments').squeeze(1)
        segment_names = []
        for sentence_idx, n_segment in enumerate(n_segments):
            for segment_idx in range(n_segment):
                segment_names.append(f'{names[sentence_idx]}_{segment_idx}')

        segment_mean_F0 = metric_as_numpy('segment_mean_F0').squeeze(1)

        # Use the part of the path following 'experiments/' as the plot title.
        title = out_dir.split('experiments/')[-1]
        for projection in ('PCA', 'tSNE'):
            plot_path = os.path.join(out_dir,
                                     f'scatter_{projection}_mean_F0.pdf')
            viz.scatter_plot(embeddings,
                             segment_names,
                             prior_mean,
                             self.pseudo_input_names,
                             gradients=segment_mean_F0,
                             gradient_title='Mean phrase F0 (Hz)',
                             projection=projection,
                             title=title,
                             out_file=plot_path)

        # Score every embedding against every prior component, broadcasting embeddings over a new component axis.
        # The score is a diagonal-Gaussian log-density with the constant term dropped, summed over the last axis.
        z = embeddings[:, None, :]
        mean = prior_mean[None, :, :]
        log_variance = prior_log_variance[None, :, :]
        densities = np.sum(-0.5 * (log_variance +
                                   (z - mean)**2 / np.exp(log_variance)),
                           axis=-1)
        posterior_classes = np.argmax(densities, axis=-1)

        file_io.save_dir(file_io.save_txt,
                         os.path.join(feats_dir, 'classes'),
                         posterior_classes,
                         segment_names,
                         feat_ext='txt')

        # Tally how many segments were assigned to each of the `n_components` classes.
        assigned = posterior_classes.reshape(-1).tolist()
        counts = np.array([(component, assigned.count(component))
                           for component in range(self.n_components)])
        file_io.save_txt(counts, os.path.join(feats_dir, 'class_counts.txt'))
def process(lab_dir, wav_dir, id_list, out_dir, state_level, question_file,
            upsample, subphone_feat_type, trim_silences,
            calculate_normalisation, normalisation_of_deltas):
    """Extracts linguistic and acoustic features for the files in id_list, optionally with normalisation parameters.

    For every file id the following are saved below `out_dir`, each as `<feature>/<file_id>.<ext>`:
        lab (npy), counters (npy), dur (txt), phones (txt), n_frames (txt), n_phones (txt),
        lf0 (npy), vuv (npy), mcep (npy), bap (npy).

    Args:
        lab_dir (str): Directory containing the label files.
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        trim_silences (bool): Whether to trim start and end silences from all features.
        calculate_normalisation (bool): Whether to calculate min-max parameters (lab, counters) and MVN parameters
            (dur, lf0, mcep, bap) after all features have been extracted.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features
            of lf0, mcep and bap.

    Raises:
        ValueError: If the labels of a file contain more than `n_phones` frames in excess of the vocoder features.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    question_set = lab_to_feat.QuestionSet(question_file)
    subphone_feature_set = lab_to_feat.SubphoneFeatureSet(subphone_feat_type)

    utils.make_dirs(os.path.join(out_dir, 'lab'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'counters'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'dur'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'phones'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'n_frames'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'n_phones'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'lf0'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'vuv'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'mcep'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'bap'), file_ids)

    for file_id in tqdm(file_ids):
        # Label processing.
        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = lab_to_feat.Label(lab_path, state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        # Column vector of shape (n_phones, 1), so individual durations are read as `durations[i, 0]` below.
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        # Acoustic processing.
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        wav, sample_rate = file_io.load_wav(wav_path)

        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
        lf0 = np.log(f0)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - f0.shape[0]
        if diff > n_phones:
            raise ValueError(
                f'Number of label frames and vocoder frames is too different for {file_id}\n'
                f'\tlabel frames {n_frames}\n'
                f'\tvocoder frames {f0.shape[0]}\n'
                f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = f0.shape[0]
            print(
                f'Cropped {diff} frames from durations for utterance {file_id}'
            )

        # Sanity check that cropping left the durations consistent with the frame count.
        assert n_frames == np.sum(durations).item()

        # Without silence trimming, frame-level features are still cropped to `n_frames`; this also handles
        # the case where the vocoder produced more frames than the labels (diff < 0).
        trim_frame_slice = slice(0, n_frames)
        if trim_silences:

            start_phone_idx, end_phone_idx = 0, n_phones
            start_frame_idx, end_frame_idx = 0, n_frames
            # Index the scalar element explicitly: `durations[0]` is a shape-(1,) array, and converting such an
            # array with `int()` is deprecated in recent NumPy releases.
            if phones[0] in ['sil', '#']:
                start_phone_idx += 1
                start_frame_idx += int(durations[0, 0])
            if phones[-1] in ['sil', '#']:
                end_phone_idx -= 1
                end_frame_idx -= int(durations[-1, 0])

            trim_phone_slice = slice(int(start_phone_idx), int(end_phone_idx))
            trim_frame_slice = slice(int(start_frame_idx), int(end_frame_idx))

            # Numerical labels are frame-level only when they were upsampled, otherwise they are phone-level.
            numerical_labels = numerical_labels[
                trim_frame_slice if upsample else trim_phone_slice]
            durations = durations[trim_phone_slice]
            phones = phones[trim_phone_slice]

            n_frames = trim_frame_slice.stop - trim_frame_slice.start
            n_phones = trim_phone_slice.stop - trim_phone_slice.start

        counter_features = counter_features[trim_frame_slice]
        lf0 = lf0[trim_frame_slice]
        vuv = vuv[trim_frame_slice]
        mcep = mcep[trim_frame_slice]
        bap = bap[trim_frame_slice]

        file_io.save_bin(numerical_labels.astype(np.float32),
                         os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features.astype(np.float32),
                         os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        file_io.save_txt(durations,
                         os.path.join(out_dir, 'dur', f'{file_id}.txt'))
        file_io.save_lines(phones,
                           os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames,
                         os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones,
                         os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

        file_io.save_bin(lf0.astype(np.float32),
                         os.path.join(out_dir, 'lf0', f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', f'{file_id}.npy'))
        file_io.save_bin(mcep.astype(np.float32),
                         os.path.join(out_dir, 'mcep', f'{file_id}.npy'))
        file_io.save_bin(bap.astype(np.float32),
                         os.path.join(out_dir, 'bap', f'{file_id}.npy'))

    if calculate_normalisation:
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        # Durations were saved as text above, and no deltas are used for them.
        process_mvn(out_dir,
                    'dur',
                    is_npy=False,
                    id_list=id_list,
                    deltas=False,
                    out_dir=out_dir)

        process_mvn(out_dir,
                    'lf0',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'mcep',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'bap',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
示例#6
0
def process(lab_dir, id_list, out_dir, state_level, question_file, upsample,
            subphone_feat_type, calculate_normalisation):
    """Extracts numerical labels, counter features, durations and phones from the label files in id_list.

    For every file id the following are saved below `out_dir`, each in its own sub-directory:
        lab (npy), counters (npy), dur (dur), phones (txt), n_frames (txt), n_phones (txt).

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        calculate_normalisation (bool): Calculate mean-variance and min-max normalisation for duration and labels.
    """
    file_ids = get_file_ids(id_list=id_list)
    question_set = QuestionSet(question_file)
    subphone_feature_set = SubphoneFeatureSet(subphone_feat_type)

    # One output sub-directory per saved feature type.
    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames',
                      'n_phones'):
        make_dirs(os.path.join(out_dir, feat_name), file_ids)

    def feat_path(feat_name, file_id, ext):
        # Location of one saved feature for one utterance.
        return os.path.join(out_dir, feat_name, f'{file_id}.{ext}')

    for file_id in file_ids:
        label = Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        # Durations are stored as a single column, one row per phone.
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        file_io.save_bin(numerical_labels, feat_path('lab', file_id, 'npy'))
        file_io.save_bin(counter_features,
                         feat_path('counters', file_id, 'npy'))
        file_io.save_txt(durations, feat_path('dur', file_id, 'dur'))
        file_io.save_lines(phones, feat_path('phones', file_id, 'txt'))

        file_io.save_txt(n_frames, feat_path('n_frames', file_id, 'txt'))
        file_io.save_txt(n_phones, feat_path('n_phones', file_id, 'txt'))

    if not calculate_normalisation:
        return

    process_minmax(out_dir, 'lab', id_list)
    process_minmax(out_dir, 'counters', id_list)
    process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False)
def process(lab_dir, id_list, out_dir, state_level, lab_dir_with_pos, wav_dir):
    """Splits each utterance into segments and saves the length of every segment in phones and in frames.

    For every file id the following text files are saved below `out_dir`:
        segment_n_phones/<file_id>.txt  number of phones in each segment.
        segment_n_frames/<file_id>.txt  number of frames in each segment.
        n_segments/<file_id>.txt        number of segments in the utterance.

    Utterances for which `segment_words` raises a ValueError or IndexError are reported with `print` and skipped.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        lab_dir_with_pos (str): Directory containing the label files from which the POS tags are read.
        wav_dir (str): Directory containing the wav files, analysed to get the number of vocoder frames.

    Raises:
        ValueError: If the labels of a file contain more than `n_phones` frames in excess of the vocoder features.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    for feat_name in ('segment_n_phones', 'segment_n_frames', 'n_segments'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        # POS tags come from a separate set of label files, with their own word boundaries.
        label_with_pos = file_io.load_lines(
            os.path.join(lab_dir_with_pos, f'{file_id}.lab'))
        pos_word_start_idxs, _ = get_word_idxs(
            label_with_pos, word_idx_sep=(r'@', r'\+'), phrase_idx_sep=(r'@', r'='))
        pos_tags = get_pos_tags(label_with_pos, pos_word_start_idxs)

        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        durations = label.phone_durations
        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        word_start_idxs, word_end_idxs = get_word_idxs(
            label.labels, word_idx_sep=(r':', r'\+'), phrase_idx_sep=(r':', r'='))
        try:
            segment_start_idxs, segment_end_idxs = segment_words(word_start_idxs, word_end_idxs, pos_tags)
        except (ValueError, IndexError) as e:
            # Report the utterance that could not be segmented and move on to the next one.
            print(f'{e}\n{file_id}')
            continue

        # Only the frame count of the vocoder analysis is needed here.
        wav, sample_rate = file_io.load_wav(os.path.join(wav_dir, f'{file_id}.wav'))
        f0, _, _, _ = world_with_reaper_f0.analysis(wav, sample_rate)
        n_vocoder_frames = f0.shape[0]

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - n_vocoder_frames
        if diff > n_phones:
            raise ValueError(f'Number of label frames and vocoder frames is too different for {file_id}\n'
                             f'\tlabel frames {n_frames}\n'
                             f'\tvocoder frames {f0.shape[0]}\n'
                             f'\tnumber of phones {n_phones}')

        if diff > 0:
            # Shape mismatch, take 1 frame off the duration of each of the last `diff` phones.
            durations[-diff:] -= 1
            n_frames = n_vocoder_frames
            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        segment_bounds = list(zip(segment_start_idxs, segment_end_idxs))
        segment_phone_lens = [end_idx - start_idx for start_idx, end_idx in segment_bounds]
        segment_frame_lens = [sum(durations[start_idx:end_idx]) for start_idx, end_idx in segment_bounds]

        out_name = f'{file_id}.txt'
        file_io.save_txt(segment_phone_lens, os.path.join(out_dir, 'segment_n_phones', out_name))
        file_io.save_txt(segment_frame_lens, os.path.join(out_dir, 'segment_n_frames', out_name))
        file_io.save_txt(len(segment_phone_lens), os.path.join(out_dir, 'n_segments', out_name))