示例#1
0
def preprocess(data, model, mode):
    """
    Apply the pre-processing selected by ``mode`` to every group in ``data``.

    The ``'X'`` entry of each group is replaced in place and the same
    ``data`` object is returned. ``lennorm`` is a helper defined elsewhere
    in the project (presumably length normalisation -- confirm).

    :param data: Mapping of group name to a mapping holding an ``'X'`` entry.
    :param model: Mapping with the statistics needed by the chosen mode:
        ``'mu_ind'`` plus ``'lda_matrix'`` or ``'lda_wccn_matrix'`` for the
        projection modes, ``'mu2'`` for ``'lennorm'``.
    :param mode: One of ``'lda+lennorm'``, ``'ldawccn+lennorm'``,
        ``'lennorm'`` or ``'none'``.
    :return: ``data`` (the object that was passed in).
    :raises NotImplementedError: If ``mode`` is not one of the above.
    """
    if mode in ('lda+lennorm', 'ldawccn+lennorm'):
        print(mode)
        # Both projection modes differ only in which matrix they read.
        matrix_key = 'lda_matrix' if mode == 'lda+lennorm' else 'lda_wccn_matrix'
        for grp in data:
            group = data[grp]
            centred = group['X'] - model['mu_ind']
            group['X'] = lennorm(centred @ model[matrix_key])
        return data

    if mode == 'lennorm':
        print(mode)
        for grp in data:
            group = data[grp]
            group['X'] = lennorm(group['X']) - model['mu2']
        return data

    if mode == 'none':
        print(mode)
        return data

    raise NotImplementedError
示例#2
0
def preprocess(x, model, mode='lennorm', mu_outd=None):
    """
    Pre-process ``x`` according to ``mode`` and return the result.

    ``lennorm`` is a helper defined elsewhere in the project (presumably
    length normalisation -- confirm).

    :param x: Input vectors.
    :param model: Mapping providing ``'mu2'`` (read whenever ``mu_outd`` is
        ``None``, regardless of mode) and, for ``'wccn+lennorm+wccn'``,
        ``'mu1'``, ``'wccn_matrix1'`` and ``'wccn_matrix2'``.
    :param mode: ``'lennorm'``, ``'wccn+lennorm+wccn'`` or ``'none'``.
    :param mu_outd: Optional mean that is subtracted instead of
        ``model['mu2']``.
    :return: The transformed vectors (``x`` itself for ``'none'``).
    :raises NotImplementedError: If ``mode`` is not recognised.
    """
    # The mean is resolved up front, so a missing 'mu2' fails for every mode.
    if mu_outd is None:
        mu2 = model['mu2']
    else:
        mu2 = mu_outd

    if mode == 'none':
        # NOTE: flagged by the original author as incorrect, because the
        # global mean should be subtracted here as well.
        return x

    if mode == 'lennorm':
        return lennorm(x) - mu2

    if mode == 'wccn+lennorm+wccn':
        first_stage = (x - model['mu1']) @ model['wccn_matrix1']
        return lennorm(first_stage) @ model['wccn_matrix2'] - mu2

    raise NotImplementedError
    def process_traning_data(self, X, spk_ids, X_ind=None, spk_ids_ind=None):
        """
        Pre-process training vectors according to ``self.prep_mode``.

        Depending on the mode, the statistics estimated along the way are
        stored on the instance (``mu1``, ``mu2``, ``mu_outd``, ``lda_matrix``
        or ``lda_wccn_matrix``). ``PreProcess`` and ``lennorm`` are defined
        elsewhere in the project.

        :param X: Training vectors, one per row.
        :param spk_ids: Speaker labels matching the rows of ``X``.
        :param X_ind: Optional in-domain vectors; only read in ``'lennorm'``
            mode, where their mean after ``lennorm`` becomes ``self.mu2``.
        :param spk_ids_ind: Not used by this method.
        :return: Tuple ``(X, spk_ids)`` after filtering and pre-processing.
        :raises NotImplementedError: If ``self.prep_mode`` is not supported.
        """
        # Evaluates as `utts_cutoff or (norm_cutoff is not None)`.
        wants_filtering = self.utts_cutoff or self.norm_cutoff is not None
        if wants_filtering:
            X, spk_ids = self._remove_bad_ivcs(X, spk_ids)

        mode = self.prep_mode
        print(mode)

        if mode == 'none':
            return X, spk_ids

        if mode == 'lda+lennorm':
            pipeline = PreProcess(X=X - X.mean(0), spk_ids=spk_ids, ndim=self.lda_dim)
            pipeline = pipeline.trans_lda()
            self.mu1 = pipeline.mu
            self.lda_matrix = pipeline.lda_matrix
            # `mu` is read again after lennorm() and stored as the second mean.
            pipeline.lennorm()
            self.mu2 = pipeline.mu
            return pipeline.X, spk_ids

        if mode == 'ldawccn+lennorm':
            pipeline = PreProcess(X=X - X.mean(0), spk_ids=spk_ids, ndim=self.lda_dim)
            pipeline = pipeline.trans_lda_wccn()
            self.mu1 = pipeline.mu
            self.lda_wccn_matrix = pipeline.lda_wccn_matrix
            pipeline.lennorm()
            self.mu2 = pipeline.mu
            return pipeline.X, spk_ids

        if mode == 'lennorm':
            normed = lennorm(X - X.mean(0))
            # Global mean of out-of-domain data
            self.mu_outd = normed.mean(0)
            if X_ind is None:
                self.mu2 = normed.mean(0)
            else:
                self.mu2 = lennorm(X_ind).mean(0)
            # Make sure training vectors for PLDA have zero mean
            return normed - normed.mean(0), spk_ids

        raise NotImplementedError
def ahc_clustering(unlabeled_file, labeled_file, n_clusters=10, display=False):
    """
    Perform agglomerative clustering on the unlabeled file (in .h5 format)
    and produce a labeled file (in .h5 format).

    The datasets ``X``, ``n_frames`` and ``spk_path`` are read from the input
    file and copied unchanged to the output file. Complete-linkage clustering
    with cosine distance is run on ``lennorm(X)`` and each row receives the
    label ``'spk-<cluster index>'``, stored in the ``spk_ids`` dataset.

    :param unlabeled_file: Path of the input .h5 file (opened read-only).
    :param labeled_file: Path of the output .h5 file; an existing file is
        replaced.
    :param n_clusters: Number of clusters to form.
    :param display: If true, show the dendrogram (via ``plot_dendrogram``)
        and a bar chart of the number of samples per cluster.
    :return: None
    """
    # Open read-only explicitly: h5py releases before 3.0 default to append
    # mode, which could create or modify the input file.
    with h5.File(unlabeled_file, 'r') as f:
        X = f['X'][:]
        n_frames = f['n_frames'][:]
        spk_path = f['spk_path'][:]

    # scikit-learn renamed `affinity` to `metric` (deprecated in 1.2, removed
    # in 1.4). Try the current keyword first and fall back for old releases.
    try:
        ahc = AgglomerativeClustering(linkage='complete',
                                      n_clusters=n_clusters,
                                      metric='cosine')
    except TypeError:
        ahc = AgglomerativeClustering(linkage='complete',
                                      n_clusters=n_clusters,
                                      affinity='cosine')
    ahc.fit(lennorm(X))

    if os.path.isfile(labeled_file):
        os.remove(labeled_file)
    str_dtype = h5.special_dtype(vlen=str)
    with h5.File(labeled_file, 'w') as f:
        f['X'] = X
        f['n_frames'] = n_frames
        f['spk_path'] = spk_path
        spk_ids = ['spk-' + str(label) for label in ahc.labels_]
        f['spk_ids'] = np.array(spk_ids, dtype=str_dtype)

    if display:
        plot_dendrogram(ahc, labels=ahc.labels_)
        plt.show(block=False)
        lbs, lbs_count = np.unique(ahc.labels_, return_counts=True)
        plt.bar(lbs, lbs_count)
        plt.xlabel('Cluster Index')
        plt.ylabel('No. of Samples')
        plt.show()
示例#5
0
    def process_traning_data(self, X, spk_ids, X_ind=None, spk_ids_ind=None):
        """
        Pre-process training vectors according to ``self.prep_mode``.

        Statistics estimated along the way (means and transform matrices) are
        stored on the instance. In the WCCN and whitening modes, statistics
        estimated from in-domain data replace the out-of-domain ones when
        ``X_ind`` is given. ``PreProcess`` and ``lennorm`` are defined
        elsewhere in the project. The caller's ``X`` is never modified in
        place by the ``None`` and ``'none'`` modes.

        :param X: Training (out-of-domain) vectors, one per row.
        :param spk_ids: Speaker labels matching the rows of ``X``.
        :param X_ind: Optional in-domain vectors.
        :param spk_ids_ind: Optional speaker labels matching ``X_ind``.
        :return: Tuple ``(X, spk_ids)`` after filtering and pre-processing.
        :raises NotImplementedError: If ``self.prep_mode`` is not supported.
        """
        # NOTE(review): this evaluates as
        # `utts_cutoff or (norm_cutoff is not None)` -- confirm that a falsy
        # utts_cutoff (e.g. 0) is meant to disable filtering.
        if self.utts_cutoff or self.norm_cutoff is not None:
            X, spk_ids = self._remove_bad_ivcs(X, spk_ids)

        print(self.prep_mode)

        # If in-domain data are provided, save the mean and WCCN estimated by in-domain data
        if self.prep_mode == 'lda+lennorm':
            X = X - X.mean(0)
            prep = PreProcess(X=X, spk_ids=spk_ids,
                              ndim=self.lda_dim).trans_lda()
            self.mu1 = prep.mu
            self.lda_matrix = prep.lda_matrix
            prep.lennorm()
            self.mu2 = prep.mu
            X = prep.X

        elif self.prep_mode == 'wccn+lennorm+wccn':
            prep = PreProcess(X=X, spk_ids=spk_ids,
                              ndim=self.lda_dim).trans_wccn()
            self.wccn_matrix1, self.mu1 = prep.wccn_matrix, prep.mu
            prep.lennorm().trans_wccn()
            self.wccn_matrix2, self.mu2 = prep.wccn_matrix, prep.mu
            X = prep.X
            if X_ind is not None:
                # Returned X stays the out-of-domain result; only the stored
                # statistics are replaced by the in-domain estimates.
                prep = PreProcess(X=X_ind,
                                  spk_ids=spk_ids_ind,
                                  ndim=self.lda_dim).trans_wccn()
                self.wccn_matrix1, self.mu1 = prep.wccn_matrix, prep.mu
                prep.lennorm().trans_wccn()
                self.wccn_matrix2, self.mu2 = prep.wccn_matrix, prep.mu

        elif self.prep_mode == 'wccn+lennorm+lda+wccn':
            prep = PreProcess(X=X, spk_ids=spk_ids,
                              ndim=self.lda_dim).trans_wccn()
            self.wccn_matrix1, self.mu1 = prep.wccn_matrix, prep.mu
            prep.lennorm().trans_lda_wccn()
            self.lda_wccn_matrix, self.mu2 = prep.lda_wccn_matrix, prep.mu
            X = prep.demean().X
            if X_ind is not None:
                prep = PreProcess(X=X_ind,
                                  spk_ids=spk_ids_ind,
                                  ndim=self.lda_dim).trans_wccn()
                self.wccn_matrix1, self.mu1 = prep.wccn_matrix, prep.mu
                prep.lennorm().trans_lda_wccn()
                self.lda_wccn_matrix, self.mu2 = prep.lda_wccn_matrix, prep.mu

        elif self.prep_mode == 'lennorm':
            X = lennorm(X)
            self.mu_outd = X.mean(0)  # Global mean of out-of-domain data
            if X_ind is not None:
                X_ind = lennorm(X_ind)
            self.mu2 = X.mean(0) if X_ind is None else X_ind.mean(0)
            # Make sure training vectors for PLDA have zero mean
            X = X - X.mean(0)

        elif self.prep_mode == 'given_mu2+lennorm':
            # Uses a mean that must already be stored in self.mu2.
            prep = PreProcess(X=X, spk_ids=spk_ids)
            X = prep.lennorm().X
            X -= self.mu2

        elif self.prep_mode == 'wccn+lennorm':
            prep = PreProcess(X=X, spk_ids=spk_ids).trans_wccn()
            self.wccn_matrix = prep.wccn_matrix
            self.mu2 = prep.lennorm().mu
            X = prep.demean().X

        elif self.prep_mode == 'whiten+lennorm':
            # Whitening statistics come from the in-domain data when given,
            # otherwise from the training data itself.
            if X_ind is not None:
                ref_X = X_ind
                # Pair in-domain vectors with their own labels when provided;
                # fall back to spk_ids to stay compatible with older callers.
                ref_ids = spk_ids_ind if spk_ids_ind is not None else spk_ids
            else:
                ref_X = X
                ref_ids = spk_ids
            # mu1 should be the global mean before whitening
            self.mu1 = ref_X.mean(0)
            prep = PreProcess(X=ref_X,
                              spk_ids=ref_ids).trans_whiten().lennorm()
            self.whiten_matrix = prep.whiten_matrix
            self.mu2 = prep.mu
            X = lennorm((X - self.mu1) @ self.whiten_matrix) - self.mu2

        elif self.prep_mode is None:
            self.mu2 = X.mean(0)
            # Out-of-place subtraction: leaves the caller's array untouched
            # and also works for integer-typed input.
            X = X - self.mu2
        elif self.prep_mode == 'none':
            self.mu2 = X.mean(0) if X_ind is None else X_ind.mean(0)
            self.mu_outd = X.mean(0)
            X = X - self.mu_outd
        else:
            raise NotImplementedError
        return X, spk_ids