Python GaussianKDE.GaussianKDE примеры использования

Язык программирования: Python

Пространство имен/Пакет: awkde

Класс/Тип: GaussianKDE

Метод/Функция: GaussianKDE

Примеров на hotexamples.com: 3

Python GaussianKDE.GaussianKDE - 3 примера найдено. Это лучшие примеры Python кода для awkde.GaussianKDE.GaussianKDE, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

GaussianKDE(3)

fit(3)

predict(3)

from_json(1)

sample(1)

to_json(1)

Пример #1

Показать файл

def disease_direction(X, y, label='Biomarker', plot_bool=False):
    """
    Estimate the direction of disease progression for one biomarker.

    Samples where X or y is NaN, or where y is not 0 or 1, are dropped.
    A GaussianKDE is fitted to the controls (y == 0) and another to the
    patients (y == 1); both densities are evaluated at every retained
    sample (sorted by value), weighted by the class proportions, and
    accumulated into empirical CDFs. The sign of the summed, normalised
    CDF difference gives the direction.

    Parameters
    ----------
    X : numpy array of biomarker values. It is combined element-wise with
        y and reshaped to a single column, so it is presumably 1-D with
        one value per subject -- TODO confirm against callers.
    y : numpy array of labels, same shape as X; 0 = control, 1 = patient.
    label : x-axis label used only when plotting.
    plot_bool : if True, plot both empirical CDFs with matplotlib.

    Returns
    -------
    +1.0 when, on balance, the patient CDF lies below the control CDF
    (patients have larger values: disease progresses with increasing
    values), -1.0 in the opposite case, and 0.0 when the summed
    difference is exactly zero.

    Author: Neil Oxtoby, November 2019
    """
    invalid = np.isnan(X) | np.isnan(y) | ~np.isin(y, [0, 1])
    y_ = y[~invalid]
    X_ = X[~invalid]

    # Fancy indexing already returns new arrays, so no explicit copy is
    # needed before sorting.
    sorted_idx = X_.argsort(axis=0)
    kde_values = X_[sorted_idx].reshape(-1, 1)
    kde_labels = y_[sorted_idx]

    # Prior of being a control.
    # (The original also called np.bincount(y_) here and discarded the
    # result; bincount rejects float input, so that dead call made the
    # function fail for float-typed labels.)
    mixture = np.count_nonzero(kde_labels == 0) / len(kde_labels)

    controls_kde = GaussianKDE(glob_bw="scott", alpha=0.3, diag_cov=False)
    patholog_kde = GaussianKDE(glob_bw="scott", alpha=0.1, diag_cov=False)
    controls_kde.fit(kde_values[kde_labels == 0])
    patholog_kde.fit(kde_values[kde_labels == 1])
    controls_score = controls_kde.predict(kde_values) * mixture
    patholog_score = patholog_kde.predict(kde_values) * (1 - mixture)

    #* Empirical cumulative distribution: (CDF_patients-CDF_controls) < 0 => disease progression is positive
    # Normalise by the largest finite running total: a NaN score would
    # poison the tail of the cumulative sum, and those entries are
    # skipped by the nansum below.
    cum_controls = np.cumsum(controls_score)
    cum_patholog = np.cumsum(patholog_score)
    cdf_controls = cum_controls / np.nanmax(cum_controls)
    cdf_patholog = cum_patholog / np.nanmax(cum_patholog)
    cdf_diff = (cdf_patholog - cdf_controls) / (cdf_patholog + cdf_controls)
    disease_dirn = -np.sign(np.nansum(cdf_diff))

    if plot_bool:
        f, a = plt.subplots()
        a.plot(kde_values, cdf_controls, label='Controls')
        a.plot(kde_values, cdf_patholog, label='Patients')
        a.legend()
        a.set_title('Disease direction: {0}'.format(disease_dirn))
        a.set_ylabel('Empirical Distribution (CDF)')
        a.set_xlabel(label)
        f.show()

    return disease_dirn

Пример #2

Показать файл

    def fit(self, X, y, implement_fixed_controls=False, patholog_dirn=None):
        """
        Fit the two-component (controls / patients) KDE mixture model.

        The samples are sorted by biomarker value, then for up to
        ``self.n_iters`` iterations: one GaussianKDE per component is
        fitted to the current labels, every sample is scored, and samples
        are relabelled by thresholding the posterior-like ratio
        P(control) / (P(control) + P(patient)) at 0.5. Iteration stops
        early when the ratio no longer changes or when the control
        mixture weight leaves [0.10, 0.90].

        Parameters
        ----------
        X : numpy array of biomarker values. The sort order is flattened
            and the values reshaped to one column, so a single feature is
            assumed -- TODO confirm against callers.
        y : numpy array of labels, 0 = control, 1 = patient. Must be
            acceptable to np.bincount (non-negative integers or bools).
        implement_fixed_controls : if True, samples originally labelled
            control that are not in the extreme 10% tail on the disease
            side are forced back to label 0 after every iteration.
        patholog_dirn : sign of disease progression. Negative means the
            disease progresses with decreasing biomarker values, positive
            with increasing values. Estimated with
            ``disease_direction(X, y)`` when None.

        Returns
        -------
        self, with ``controls_kde``, ``patholog_kde``, ``mixture`` and
        ``iter_`` set (and ``bandwidth`` when it was None).

        Raises
        ------
        ValueError
            If the disease direction is zero/NaN or ``self.n_iters`` is
            below 1. Previously both cases ended in an UnboundLocalError.
        """
        #* Requires direction of disease progression as input
        if patholog_dirn is None:
            patholog_dirn = disease_direction(X, y)
        if not (patholog_dirn < 0 or patholog_dirn > 0):
            raise ValueError(
                'Disease direction in fit(...,patholog_dirn) must be either '
                'positive or negative. patholog_dirn = {0}'.format(
                    patholog_dirn))
        if self.n_iters < 1:
            raise ValueError(
                'n_iters must be at least 1, got {0}'.format(self.n_iters))

        # Fancy indexing returns new arrays, so the caller's X and y are
        # never modified below.
        sorted_idx = X.argsort(axis=0).flatten()
        kde_values = X[sorted_idx].reshape(-1, 1)
        kde_labels0 = y[sorted_idx]
        # NOTE(review): kde_labels is the SAME array object as kde_labels0
        # until it is rebound after the first scoring pass. The in-place
        # flip in iteration 0 therefore also changes kde_labels0, which is
        # later used as the "original" control labels. Kept as in the
        # original implementation -- confirm this is intended.
        kde_labels = kde_labels0

        # Prior of being a control
        mixture = np.count_nonzero(kde_labels == 0) / len(kde_labels)
        old_ratios = np.zeros(kde_labels.shape)
        iter_count = 0
        if self.bandwidth is None:
            #* Rule of thumb
            self.bandwidth = hscott(X)

        def _refit_and_score(controls, patholog, labels, weight):
            # Refit both components on `labels`, score every sample and
            # return (control ratio, direction estimated from the CDFs).
            controls.fit(kde_values[labels == 0])
            patholog.fit(kde_values[labels == 1])
            controls_score = controls.predict(kde_values) * weight
            patholog_score = patholog.predict(kde_values) * (1 - weight)
            score_ratio = controls_score / (controls_score + patholog_score)
            #* Empirical cumulative distributions of the weighted scores.
            # Normalise by the largest finite running total; NaN entries
            # are skipped by the nansum.
            cum_controls = np.cumsum(controls_score)
            cum_patholog = np.cumsum(patholog_score)
            cdf_controls = cum_controls / np.nanmax(cum_controls)
            cdf_patholog = cum_patholog / np.nanmax(cum_patholog)
            cdf_diff = (cdf_patholog - cdf_controls) / (cdf_patholog +
                                                        cdf_controls)
            return score_ratio, -np.sign(np.nansum(cdf_diff))

        def _healthy_side_of_controls():
            #* "Normal" biomarkers are on the healthy side of the median of
            #* the samples currently marked control in kde_labels0.
            controls_median = np.median(kde_values[kde_labels0 == 0])
            if patholog_dirn < 0:
                #* More normal (greater) than half the controls
                return kde_values > controls_median
            #* More normal (less) than half the controls
            return kde_values < controls_median

        for i in range(self.n_iters):
            #* Automatic variable/local bandwidth for each component: awkde package from github
            controls_kde = GaussianKDE(glob_bw="scott",
                                       alpha=self.beta,
                                       diag_cov=False)
            patholog_kde = GaussianKDE(glob_bw="scott",
                                       alpha=self.alpha,
                                       diag_cov=False)
            ratio, disease_dirn = _refit_and_score(controls_kde, patholog_kde,
                                                   kde_labels, mixture)
            labels_forced_normal_alt = _healthy_side_of_controls()

            #* FIXME: Make this a prior and change the mixture modelling to be Bayesian
            #* First iteration only: implement "prior" that flips healthy-looking patients (before median for controls) to pre-event label
            if i == 0:
                kde_labels[np.where(labels_forced_normal_alt)[0]] = 0
                bin_counts = np.bincount(kde_labels).astype(float)
                mixture = bin_counts[0] / bin_counts.sum()
                # Refit on the flipped labels; this supersedes the scores
                # computed just above. The same KDE objects are refitted,
                # exactly as in the original call sequence.
                ratio, disease_dirn = _refit_and_score(
                    controls_kde, patholog_kde, kde_labels, mixture)
                labels_forced_normal_alt = _healthy_side_of_controls()

            if np.all(ratio == old_ratios):
                # Converged: the ratios did not change since last pass
                break
            iter_count += 1
            old_ratios = ratio
            kde_labels = ratio < 0.5  # True (1) = patient, False (0) = control

            #* Labels to swap:
            diff_y = np.hstack(
                ([0], np.diff(kde_labels)))  # !=0 where adjacent labels differ

            if (np.sum(diff_y != 0) >= 2
                    and np.unique(kde_labels).shape[0] == 2):
                # kde_label upon which to split: 1 if all 0s are adjacent, 0 otherwise
                split_y = int(
                    np.all(np.diff(np.where(kde_labels == 0)) == 1))
                # lengths of each contiguous set of labels
                sizes = [
                    run.shape[0]
                    for run in np.split(diff_y,
                                        np.where(diff_y != 0)[0])
                ]

                #* Identify which labels to swap using direction of abnormality: avg(controls) vs avg(patients)
                #* Note that this is now like k-medians clustering, rather than k-means
                other_y = (split_y + 1) % 2
                split_prior_smaller = (
                    np.median(kde_values[kde_labels == split_y]) <
                    np.median(kde_values[kde_labels == other_y]))
                # NOTE(review): indexing sizes[2] as the upper run assumes
                # exactly three runs of labels -- confirm for noisier data.
                if split_prior_smaller:
                    replace_idxs = np.arange(kde_values.shape[0])[
                        -sizes[2]:]  # greater values are swapped
                else:
                    replace_idxs = np.arange(
                        kde_values.shape[0]
                    )[:sizes[0]]  # lesser values are swapped
                kde_labels[replace_idxs] = other_y  # swaps labels

            #* Disease direction: force pre-event/healthy-looking patients to flip
            kde_labels[np.where(labels_forced_normal_alt)[0]] = 0

            #*** Prevent label swapping for "strong controls"
            # Alternative criteria tried by the original author and left
            # disabled: a CDF-tail threshold ((en-1)/(en+1) with en = 10)
            # and a PDF-ratio threshold (ratio > 0.33).
            if implement_fixed_controls:
                #*** Outlier criteria for weak (e.g., low-performing on test; or potentially prodromal in sporadic disease) controls: quantiles
                if disease_dirn > 0:
                    # upper 10% tail counts as extreme
                    extreme_cases = np.greater(
                        kde_values, np.quantile(kde_values, 0.90))
                else:
                    # lower 10% tail counts as extreme
                    extreme_cases = np.less(
                        kde_values, np.quantile(kde_values, 1 - 0.90))
                fixed_controls_criteria = (
                    (kde_labels0 == 0).reshape(-1, 1) &
                    ~extreme_cases.reshape(-1, 1))
                kde_labels[np.where(fixed_controls_criteria)[0]] = 0

            bin_counts = np.bincount(kde_labels).astype(float)
            mixture = bin_counts[0] / bin_counts.sum()
            if mixture < 0.10 or mixture > 0.90:
                # Mixture weight too low/high: stop iterating
                break
        self.controls_kde = controls_kde
        self.patholog_kde = patholog_kde
        self.mixture = mixture
        self.iter_ = iter_count
        return self

Пример #3

Показать файл

# Target density for the second variable: a^2 * x * exp(-a * x)
a = 100.

n_samples = 1000
# First variable: normal draws with the `mean` and `sigma` set earlier
logE_sam = rndgen.normal(mean, sigma, size=n_samples)

# Second variable: minus the log of a product of two uniforms, scaled by
# 1/a. Recipe from pythia8:
# home.thep.lu.se/~torbjorn/doxygen/Basics_8h_source.html
u1, u2 = rndgen.uniform(size=(2, n_samples))
sigma_sam = -np.log(u1 * u2) / a

# Stack as rows and transpose: shape must be (n_points, n_features)
sample = np.array([logE_sam, sigma_sam]).T

# Build the adaptive KDE and fit it to the sample
print("Fitting model to {} sample points.".format(n_samples))
kde = GaussianKDE(glob_bw="silverman", alpha=0.5, diag_cov=True)
kde.fit(sample)

# Round-trip the fitted model through a JSON file
outf = "./example_KDE.json"
print("Saving model to {}".format(outf))
kde.to_json(outf)
print("Loading same model from {}".format(outf))
kde = GaussianKDE.from_json(outf)

# Dense evaluation grid spanning the sampled range of each feature
minx, miny = sample.min(axis=0)
maxx, maxy = sample.max(axis=0)

x = np.linspace(minx, maxx, 100)
y = np.linspace(miny, maxy, 100)