Example #1
def get_kd(model, X_train, Y_train, X_test, X_test_noisy, X_test_adv):
    """
    Get kernel density scores
    :param model: trained Keras classifier
    :param X_train: training inputs
    :param Y_train: one-hot training labels
    :param X_test: clean test inputs
    :param X_test_noisy: noisy copies of the test inputs
    :param X_test_adv: adversarial versions of the test inputs
    :return: artifacts: positive and negative examples with kd values, 
            labels: adversarial (label: 1) and normal/noisy (label: 0) examples
    """
    # Get deep feature representations
    print('Getting deep feature representations...')
    X_train_features = get_deep_representations(model,
                                                X_train,
                                                batch_size=args.batch_size)
    X_test_normal_features = get_deep_representations(
        model, X_test, batch_size=args.batch_size)
    X_test_noisy_features = get_deep_representations(
        model, X_test_noisy, batch_size=args.batch_size)
    X_test_adv_features = get_deep_representations(model,
                                                   X_test_adv,
                                                   batch_size=args.batch_size)
    # Train one KDE per class
    print('Training KDEs...')
    class_inds = {}
    for i in range(Y_train.shape[1]):
        class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0]
    kdes = {}
    warnings.warn(
        "Using pre-set kernel bandwidths that were determined "
        "optimal for the specific CNN models of the paper. If you've "
        "changed your model, you'll need to re-optimize the "
        "bandwidth.")
    print('bandwidth %.4f for %s' % (BANDWIDTHS[args.dataset], args.dataset))
    for i in range(Y_train.shape[1]):
        kdes[i] = KernelDensity(kernel='gaussian',
                                bandwidth=BANDWIDTHS[args.dataset]) \
            .fit(X_train_features[class_inds[i]])
    # Get model predictions
    print('Computing model predictions...')
    preds_test_normal = model.predict_classes(X_test,
                                              verbose=0,
                                              batch_size=args.batch_size)
    preds_test_noisy = model.predict_classes(X_test_noisy,
                                             verbose=0,
                                             batch_size=args.batch_size)
    preds_test_adv = model.predict_classes(X_test_adv,
                                           verbose=0,
                                           batch_size=args.batch_size)
    # Get density estimates
    print('computing densities...')
    densities_normal = score_samples(kdes, X_test_normal_features,
                                     preds_test_normal)
    densities_noisy = score_samples(kdes, X_test_noisy_features,
                                    preds_test_noisy)
    densities_adv = score_samples(kdes, X_test_adv_features, preds_test_adv)

    print("densities_normal:", densities_normal.shape)
    print("densities_adv:", densities_adv.shape)
    print("densities_noisy:", densities_noisy.shape)

    ## skip the normalization, you may want to try different normalizations later
    ## so at this step, just save the raw values
    # densities_normal_z, densities_adv_z, densities_noisy_z = normalize(
    #     densities_normal,
    #     densities_adv,
    #     densities_noisy
    # )

    densities_pos = densities_adv
    densities_neg = np.concatenate((densities_normal, densities_noisy))
    artifacts, labels = merge_and_generate_labels(densities_pos, densities_neg)

    return artifacts, labels
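The function above relies on a score_samples(kdes, features, preds) helper that is not shown in this excerpt. A minimal stand-in, assuming it simply scores each feature vector under the KDE fitted for its predicted class, might look like the sketch below (an illustrative implementation, not the repository's actual helper):

import numpy as np

def score_samples(kdes, features, preds):
    # Score each sample under the KernelDensity model of its predicted class.
    # kdes:     dict mapping class index -> fitted sklearn KernelDensity
    # features: (n_samples, n_features) deep representations
    # preds:    (n_samples,) predicted class indices
    return np.asarray([
        kdes[int(c)].score_samples(f.reshape(1, -1))[0]
        for f, c in zip(features, preds)
    ])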
Example #2
dist = np.sqrt(
    np.sum(np.square(y_reconstructed - test_latents).reshape(
        len(test_latents), -1),
           axis=1))
sns.distplot(dist)
pred_save(dist, PRED_FOLDER + 'prediction_unet_vae_pca_reconstruced.csv')

# %%
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
print('TSNE fitting...')
tsne = TSNE(n_components=2, random_state=SEED, verbose=True)
y_TSNE = tsne.fit_transform(test_latents)
plt.scatter(y_TSNE[:, 0], y_TSNE[:, 1], s=1)

rmse_tsne_test = np.sqrt(
    np.square(y_TSNE[:, 0] - np.mean(y_TSNE[:, 0])) +
    np.square(y_TSNE[:, 1] - np.mean(y_TSNE[:, 1])))
sns.distplot(rmse_tsne_test)
pred_save(rmse_tsne_test, PRED_FOLDER + 'prediction_unet_vae_tsne_rmse.csv')
# %%
from sklearn.neighbors import KernelDensity
kd = KernelDensity()
kd.fit(test_latents)
score = [kd.score(i.reshape(1, -1)) for i in test_latents]
score = score - np.min(score)
sns.distplot(score)
pred_save(score, PRED_FOLDER + 'prediction_unet_vae_latentkd.csv')

# %%
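Side note: the per-sample loop above works, but KernelDensity.score_samples accepts a 2-D array and scores all points in one call. A vectorized sketch using the same variables:

# equivalent, vectorized form of the scoring loop above
score = kd.score_samples(test_latents)
score = score - np.min(score)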
Example #3
    def one_cut(self, x):

        #~ x = x[x>(thresh-threshold_margin)]
        #~ kde = scipy.stats.gaussian_kde(x, bw_method=kde_bandwith)
        #~ d = kde(bins)
        #~ d /= np.sum(d)
        
        kde = KernelDensity(kernel='gaussian', bandwidth=self.kde_bandwith)
        d = kde.fit(x[:, np.newaxis]).score_samples(self.bins[:, np.newaxis])
        d = np.exp(d)



        #local max
        d0, d1, d2 = d[:-2], d[1:-1], d[2:]
        #~ ind_max,  = np.nonzero((d0<d1) & (d2<d1))
        ind_max,  = np.nonzero((d0<d1) & (d2<=d1))
        ind_max += 1
        #~ ind_min,  = np.nonzero((d0>d1) & (d2>d1))
        ind_min,  = np.nonzero((d0>d1) & (d2>=d1))
        ind_min += 1
        
        #~ print('ind_max', ind_max)
        #~ print('ind_min', ind_min)
        #~ fig, ax = plt.subplots()
        #~ ax.plot(d)
        #~ ax.plot(ind_max, d[ind_max], ls='None', marker='o', color='r')
        #~ ax.plot(ind_min, d[ind_min], ls='None', marker='o', color='g')
        #~ plt.show()

        if ind_max.size>0:
            if ind_min.size==0:
                assert ind_max.size==1, 'Very suspicious: no minimum but several maxima'
                ind_min = np.array([0, self.bins.size-1], dtype='int64')
            else:
                ind_min = ind_min.tolist()
                if ind_max[0]<ind_min[0]:
                    ind_min = [0] + ind_min
                if ind_max[-1]>ind_min[-1]:
                    ind_min = ind_min + [ self.bins.size-1]
                ind_min = np.array(ind_min, dtype='int64')
        
        
        # Loop: reject small rebound minima/maxima
        #~ print('loop1')
        ind_max_cleaned = ind_max.tolist()
        ind_min_cleaned = ind_min.tolist()
        while True:
            rejected_minima = None
            rejected_maxima = None
            #~ print('ind_min_cleaned', ind_min_cleaned, self.bins[ind_min_cleaned])
            #~ print('ind_max_cleaned', ind_max_cleaned, self.bins[ind_max_cleaned])
            for i, ind in enumerate(ind_min_cleaned[1:-1]):
                prev_max = ind_max_cleaned[i]
                next_max = ind_max_cleaned[i+1]
                
                delta_density_prev = d[prev_max] - d[ind]
                delta_density_next = d[next_max] - d[ind]
                
                if min(delta_density_prev, delta_density_next)<d[ind]*self.minima_rejection_factor:
                    rejected_minima = ind
                    if delta_density_prev<delta_density_next:
                        rejected_maxima = prev_max
                    else:
                        rejected_maxima = next_max
                    break
            
            if rejected_minima is None:
                break
            
            ind_max_cleaned.remove(rejected_maxima)
            ind_min_cleaned.remove(rejected_minima)
        
        #~ print('loop2')
        # Loop: reject density regions with too few spikes
        while True:
            rejected_minima = None
            rejected_maxima = None
            
            #~ print('ind_min_cleaned', ind_min_cleaned, self.bins[ind_min_cleaned])
            #~ print('ind_max_cleaned', ind_max_cleaned, self.bins[ind_max_cleaned])
            
            for i, ind in enumerate(ind_min_cleaned[:-1]):
                next_min = ind_min_cleaned[i+1]
                n = np.sum(d[ind:next_min]*self.binsize) * x.size
                #~ print('n', n, self.bins[ind], self.bins[next_min], np.sum(d))
                if n<self.nb_min:
                    rejected_maxima = ind_max_cleaned[i]
                    if d[ind]<d[next_min]:
                        rejected_minima = next_min
                    else:
                        rejected_minima = ind
                    break
            
            if rejected_minima is None:
                break
            
            ind_max_cleaned.remove(rejected_maxima)
            ind_min_cleaned.remove(rejected_minima)
            

        #~ print('loop3')
        # TODO: eliminate the first one with the same criterion as loop 1
        if len(ind_min_cleaned)>=2:
            den_min0 = d[ind_min_cleaned[0]]
            den_max0 = d[ind_max_cleaned[0]]
            if (den_max0-den_min0)<den_min0*self.minima_rejection_factor:
                ind_min_cleaned = ind_min_cleaned[1:]
                ind_max_cleaned = ind_max_cleaned[1:]
        
        #~ print('loop4')
        if len(ind_min_cleaned)>=2:
            if self.bins[ind_max_cleaned[0]]<self.threshold+self.margin_first_max:
                ind_min_cleaned = ind_min_cleaned[1:]
                ind_max_cleaned = ind_max_cleaned[1:]
        
        
        if len(ind_min_cleaned)>=2:
            #TODO here criterium for best
            return self.bins[ind_min_cleaned[-2]], self.bins[ind_min_cleaned[-1]], d
        else:
            return None, None, d
Example #4
        r11_high = np.max(line_X11[lindx])
    else:
        r11_low = -1.0
        r11_high = -1.0
    if i == 0:
        r11_low0 = r11_low
        r11_high0 = r11_high
    print(' lz region for 1:1 =', r11_low, r11_high)
    ax[i].add_patch(
      patches.Rectangle((r11_low, ymin_hist), r11_high-r11_low, \
                        ymax_hist-ymin_hist, facecolor='grey', fill=True, alpha=0.5))

    # histogram
    # ax[i].hist(lzs, bins = lz_bins, fc='#AAAAFF', density=True)
    # KDE
    kde = KernelDensity(kernel='epanechnikov', \
                        bandwidth=hlz).fit(lzs.reshape(-1, 1), sample_weight=probs)
    log_dens = kde.score_samples(lz_bins.reshape(-1, 1))
    ax[i].plot(lz_bins, np.exp(log_dens), color='black')

    # set tick params
    ax[i].tick_params(labelsize=16, color='k', direction="in")
    ax[i].set_xlim(lzmin_hist, lzmax_hist)
    ax[i].set_ylim(ymin_hist, ymax_hist)
    if i == njrsamp - 1:
        ax[i].set_xticks([0.5, 1.0, 1.5])
    if i == 0:
        ax[i].set_ylabel(r"dN($0.03<{\rm J}_{\rm R}<0.1$)", fontsize=14)
    if i == 1:
        ax[i].set_ylabel(r"dN($0.01<{\rm J}_{\rm R}<0.02$)", fontsize=14)
        ax[i].set_xticks([0.5, 1.0, 1.5])
Example #5
    n_features = [15,25,30]

    
    print('===============================================================================================')


    for i in range(len(n_features)):
        
        #-------------------------Reducing the number of dimensions using PCA---------------------
        pca = PCA(n_components=n_features[i], whiten=False)
        data = pca.fit_transform(digits.data)

        #-------------Performing Grid Search Cross-Validation to optimize the bandwidth-----------
        print('Performing Grid Search Cross-Validation to optimize the bandwidth')
        params = {'bandwidth': np.logspace(-1, 1, 20)}
        grid = GridSearchCV(KernelDensity(), params, cv=5)
        grid.fit(data)

        print('Best Bandwidth using {} features: {}'.format(n_features[i],grid.best_estimator_.bandwidth))

        #--------------------------Perform KDE using this best Bandwidth--------------------------
        kde = grid.best_estimator_

        # ---------------------Sample 48 new data points from estimated density-------------------
        new_data = kde.sample(48, random_state=0)
        new_data = pca.inverse_transform(new_data)

        # turn data into a 6x8 grid
        new_data = new_data.reshape((6, 8, -1))
        real_data = digits.data[:48].reshape((6, 8, -1))
Example #6
#        np.place(vec, abs(vec)<1, 1)
#        np.place(vec, abs(vec)>1, 0)
#        return vec
#        
#    def densite_estime(X, data, h):
#        N = data.shape[0]
#        return((1/2*N*h) * sum(is_within_the_hypercube((X-data)/h)))
#    
#    def hist_noyaux_boxcar(vec_X, data, bandwidth):
#        Y_densite = []
#        for i in range(vec_X.shape[0]):
#            Y_densite.append(densite_estime(vec_X[i,:], data, bandwidth))
#        return Y_densite/sum(Y_densite)
    
    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
    kde_1 = KernelDensity(kernel='tophat', bandwidth=0.3).fit(X1)
    kde_2 = KernelDensity(kernel='tophat', bandwidth=1).fit(X1)
    kde_3 = KernelDensity(kernel='tophat', bandwidth=2).fit(X1)
    kde_4 = KernelDensity(kernel='tophat', bandwidth=5).fit(X1)
    
    kde_1_2 = KernelDensity(kernel='tophat', bandwidth=0.3).fit(X2)
    kde_2_2 = KernelDensity(kernel='tophat', bandwidth=1).fit(X2)
    kde_3_2 = KernelDensity(kernel='tophat', bandwidth=2).fit(X2)
    kde_4_2 = KernelDensity(kernel='tophat', bandwidth=5).fit(X2)
    
    fig, ax = pyplot.subplots(2, 1, sharex=True, sharey=True)
    fig.subplots_adjust(hspace=0.4, wspace=0.05)
    
    ax[0].plot(X_plot, np.exp(kde_1.score_samples(X_plot)), label="bandwidth=0.3")
    ax[0].plot(X_plot, np.exp(kde_2.score_samples(X_plot)), label="bandwidth=1")
    ax[0].plot(X_plot, np.exp(kde_3.score_samples(X_plot)), label="bandwidth=2")
Example #7
           weights=model.layers[4].get_weights()))
encoder_replica.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
encoder_replica.summary()

# scikit-learn's KernelDensity expects a 2-D (n_samples, n_features) array, so each encoded tensor must be flattened into a 1-D feature vector
encoded_images = encoder_replica.predict_generator(train_generator)
encoded_images_flat = [np.reshape(img, (27)) for img in encoded_images]

validation_encoded = encoder_replica.predict_generator(validation_generator)
val_enc_flat = [np.reshape(img, (27)) for img in validation_encoded]

anom_encoded = encoder_replica.predict_generator(anomaly_generator)
anom_enc_flat = [np.reshape(img, (27)) for img in anom_encoded]

# Kernel Density Estimation of the encoded vectors
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(encoded_images_flat)
training_density_scores = kde.score_samples(encoded_images_flat)
validation_density_scores = kde.score_samples(val_enc_flat)
anomaly_density_scores = kde.score_samples(anom_enc_flat)

# Plotting the density distributions of the training (normal), validation (normal) and anomalous images
# Ideally we want to see high separation between the normal and anomalous classes
plt.figure(figsize=(10, 7))
plt.title('Distribution of Density Scores')
plt.hist(training_density_scores, 12, alpha=0.5, label='Training Normal')
plt.hist(validation_density_scores, 12, alpha=0.5, label='Validation Normal')
plt.hist(anomaly_density_scores, 12, alpha=0.5, label='Anomalies')
plt.legend(loc='upper right')
plt.xlabel('Density Score')
plt.show()
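To turn these density scores into an actual anomaly decision, one possible follow-up (not part of the snippet above; the 1% cut-off is purely illustrative) is to threshold at a low percentile of the training scores:

import numpy as np

# flag images whose density score falls below the 1st percentile of the
# training (normal) scores -- the percentile is an arbitrary example value
threshold = np.percentile(training_density_scores, 1)
anomaly_flags = anomaly_density_scores < threshold
print('flagged %d of %d anomalous images' % (anomaly_flags.sum(), len(anomaly_flags)))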
Example #8
# Data Selection
no_transaction = X[:, 1]  # Frequency
sum_amounts = X[:, 2]  # Money

# Plot a 1D density example
N = 100
np.random.seed(1)
N = no_transaction.shape[0]
X = no_transaction[:,
                   np.newaxis]  #np.random.normal(0, 1, 0.3 * N)[:, np.newaxis]
X_plot = np.linspace(np.min(X), np.max(X), 1000)[:, np.newaxis]

fig, ax = plt.subplots()

for kernel in ['gaussian', 'tophat', 'epanechnikov']:
    kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0],
            np.exp(log_dens),
            '-',
            label="kernel = '{0}'".format(kernel))

ax.text(6, 0.38, "N={0} points".format(N))

ax.legend(loc='upper left')
#ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

#ax.set_xlim(-4, 9)
#ax.set_ylim(-0.02, 0.4)
plt.show()
Example #9
    # load decoder
    decoder_name = encoder_name.replace('encoder', 'decoder')
    with open(decoder_name) as fl:
        decoder = model_from_yaml(fl)

    decoder.load_weights(decoder_name[:-4] + 'h5')
    target_seqs = decoder.predict(target_latents, batch_size=1000)

    generated_seqs = target_seqs[:,::10,:]
    X_test=X_test.reshape(X_test.shape[0],X_test.shape[1]*X_test.shape[2])
    generated_seqs=generated_seqs.reshape(generated_seqs.shape[0],generated_seqs.shape[1]*generated_seqs.shape[2])

    if args.bandwidth is None:
        ##grid search
        params = {'bandwidth': np.logspace(-1, 0., 10)}
        grid = GridSearchCV(KernelDensity(), params, cv=3, verbose=1)
        X_search = np.random.permutation(X)[:10000,::10,:]
        X_search = X_search.reshape(X_search.shape[0],X_search.shape[1]*X_search.shape[2])
        grid_result = grid.fit(X_search)


        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        for params, mean_score, scores in grid_result.grid_scores_:
            print("scores.mean:%f (score.std:%f) with: %r" % (scores.mean(), scores.std(), params))
        #bandwidth = 0.25
        bandwidth = grid_result.best_params_['bandwidth']
    else:
        bandwidth = args.bandwidth

    ParzenWindow = KernelDensity(bandwidth=bandwidth, algorithm='auto', kernel='gaussian', metric='euclidean')
    print "shape of generated_seqs is {}".format(generated_seqs.shape)
Example #10
def doublet_finder(ds: loompy.LoomConnection,
                   use_pca: bool = False,
                   proportion_artificial: float = 0.20,
                   fixed_th: float = None,
                   k: int = None,
                   name: object = "tmp",
                   qc_dir: object = ".",
                   graphs: bool = True,
                   max_th: float = 1) -> np.ndarray:
    # Step 1: Generate artificial doublets from input
    logging.debug("Creating artificial doublets")
    n_real_cells = ds.shape[1]
    n_doublets = int(n_real_cells / (1 - proportion_artificial) - n_real_cells)
    doublets = np.zeros((ds.shape[0], n_doublets))
    for i in range(n_doublets):
        a = np.random.choice(ds.shape[1])
        b = np.random.choice(ds.shape[1])
        doublets[:, i] = ds[:, a] + ds[:, b]

    data_wdoublets = np.concatenate((ds[:, :], doublets), axis=1)

    logging.debug("Feature selection and dimensionality reduction")
    genes = FeatureSelectionByVariance(2000).fit(ds)
    if use_pca:
        # R function uses log2 counts/million
        f = np.divide(data_wdoublets.sum(axis=0), 10e6)
        norm_data = np.divide(data_wdoublets, f)
        norm_data = np.log(norm_data + 1)
        pca = PCA(n_components=50).fit_transform(norm_data[genes, :].T)
    else:
        data = sparse.coo_matrix(data_wdoublets[genes, :]).T
        hpf = HPF(k=64,
                  validation_fraction=0.05,
                  min_iter=10,
                  max_iter=200,
                  compute_X_ppv=False)
        hpf.fit(data)
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

    if k is None:
        k = int(np.min([100, ds.shape[1] * 0.01]))

    logging.info(f"Initialize NN structure with k = {k}")
    if use_pca:
        knn_result = NearestNeighbors(n_neighbors=k,
                                      metric='euclidean',
                                      n_jobs=4)
        knn_result.fit(pca)
        knn_dist, knn_idx = knn_result.kneighbors(X=pca, return_distance=True)

        num = ds.shape[1]
        knn_result1 = NearestNeighbors(n_neighbors=k,
                                       metric='euclidean',
                                       n_jobs=4)
        knn_result1.fit(pca[0:num, :])
        knn_dist1, knn_idx1 = knn_result1.kneighbors(X=pca[num + 1:, :],
                                                     n_neighbors=10)
        knn_dist_rc, knn_idx_rc = knn_result1.kneighbors(X=pca[0:num, :],
                                                         return_distance=True)

    else:
        knn_result = NNDescent(data=theta, metric=jensen_shannon_distance)
        knn_idx, knn_dist = knn_result.query(theta, k=k)

        num = ds.shape[1]
        knn_result1 = NNDescent(data=theta[0:num, :],
                                metric=jensen_shannon_distance)
        knn_idx1, knn_dist1 = knn_result1.query(theta[num + 1:, :], k=10)
        knn_idx_rc, knn_dist_rc = knn_result1.query(theta[0:num, :], k=k)

    dist_th = np.mean(knn_dist1.flatten()) + 1.64 * np.std(knn_dist1.flatten())

    doublet_freq = np.logical_and(knn_idx > ds.shape[1], knn_dist < dist_th)
    doublet_freq_A = doublet_freq[ds.shape[1]:ds.shape[1] + n_doublets, :]
    mean1 = doublet_freq_A.mean(axis=1)
    mean2 = doublet_freq_A[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score_A = np.maximum(mean1, mean2)

    doublet_freq = doublet_freq[0:ds.shape[1], :]
    mean1 = doublet_freq.mean(axis=1)
    mean2 = doublet_freq[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score = np.maximum(mean1, mean2)
    doublet_flag = np.zeros(ds.shape[1], int)
    doublet_th1 = 1
    doublet_th2 = 1
    doublet_th = 1
    #Infer TH from the data or use fixed TH

    # instantiate and fit the KDE model
    kde = KernelDensity(bandwidth=0.1, kernel='gaussian')
    kde.fit(doublet_score_A[:, None])

    # score_samples returns the log of the probability density
    xx = np.linspace(doublet_score_A.min(), doublet_score_A.max(),
                     len(doublet_score_A)).reshape(-1, 1)

    logprob = kde.score_samples(xx)
    if fixed_th is not None:
        doublet_th = float(fixed_th)
    else:
        #Check if the distribution is bimodal
        intervals = UniDip(np.exp(logprob)).run()
        if (len(intervals) > 1):
            kmeans = KMeans(n_clusters=2).fit(
                doublet_score_A.reshape(len(doublet_score_A), 1))
            high_cluster = np.where(
                kmeans.cluster_centers_ == max(kmeans.cluster_centers_))[0][0]
            doublet_th1 = np.around(np.min(
                doublet_score_A[kmeans.labels_ == high_cluster]),
                                    decimals=3)

        #0.5% for every 1000 cells - the rate of detectable doublets by 10X
        doublet_th2 = np.percentile(doublet_score, 100 - (5e-4 * ds.shape[1]))
        doublet_th2 = np.around(doublet_th2, decimals=3)
        #The TH shouldn't be higher than indicated
        if doublet_th2 > max_th:
            doublet_th2 = max_th
        if doublet_th1 > max_th:
            doublet_th1 = max_th
        if (len(np.where(doublet_score >= doublet_th1)[0]) >
            (len(np.where(doublet_score >= doublet_th2)[0]))):
            doublet_th = doublet_th2
        else:
            doublet_th = doublet_th1
    doublet_flag[doublet_score >= doublet_th] = 1

    #Calculate the score for the cells that are nn of the marked doublets
    if use_pca:
        pca_rc = pca[0:num, :]
        knn_dist1_rc, knn_idx1_rc = knn_result1.kneighbors(
            X=pca_rc[doublet_flag == 1, :],
            n_neighbors=10,
            return_distance=True)
    else:
        theta_rc = theta[0:num, :]
        knn_idx1_rc, knn_dist1_rc = knn_result1.query(
            theta_rc[doublet_flag == 1, :], k=10)

    dist_th = np.mean(
        knn_dist1_rc.flatten()) + 1.64 * np.std(knn_dist1_rc.flatten())
    doublet2_freq = np.logical_and(doublet_flag[knn_idx_rc] == 1,
                                   knn_dist_rc < dist_th)
    doublet2_nn = knn_dist_rc < dist_th
    doublet2_score = doublet2_freq.sum(axis=1) / doublet2_nn.sum(axis=1)

    doublet_flag[np.logical_and(doublet_flag == 0,
                                doublet2_score >= doublet_th / 2)] = 2

    if graphs:

        if (use_pca):
            ds.ca.PCA = pca[0:ds.shape[1], :]
        else:
            ds.ca.HPF = theta[0:ds.shape[1], :]
        doublets_plots.plot_all(ds,
                                out_file=os.path.join(qc_dir + "/" + name +
                                                      "_doublets.png"),
                                labels=doublet_flag,
                                doublet_score_A=doublet_score_A,
                                logprob=logprob,
                                xx=xx,
                                score1=doublet_th1,
                                score2=doublet_th2,
                                score=doublet_th)

    logging.info(
        f"Doublet fraction: {100*len(np.where(doublet_flag>0)[0])/ds.shape[1]:.2f}%, {len(np.where(doublet_flag>0)[0])} cells. \n\t\t\t(Expected detectable doublet fraction: {(5e-4*ds.shape[1]):.2f}%)"
    )

    return doublet_score, doublet_flag
Example #11
def plot_pfam_familysizes(pfam_df, plot_dir):

    # define counts for Pfam families with and without annotated PDB structures
    struct = np.log(pfam_df.query('nr_structures > 0')['nr_sequences'].values)
    no_struct = np.log(
        pfam_df.query('nr_structures == 0')['nr_sequences'].values)

    # define grid for kernel density estimation
    x_grid = np.linspace(np.min(struct.tolist() + no_struct.tolist()),
                         np.max(struct.tolist() + no_struct.tolist()), 500)
    bandwidth = 0.3

    #define colors for struct and no_struct
    colors = ['rgb(22, 96, 167)', 'rgb(205, 12, 24)']
    colors = ['rgb(170, 221, 172)', 'rgb(3, 177, 74)']
    colors = ['rgb(170, 170, 170)', 'rgb(0,0,0)']

    # kernel density estimate for Pfam families with annotated structure
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=bandwidth).fit(struct.reshape(-1, 1))
    struct_density = np.exp(kde.score_samples(x_grid.reshape(-1, 1)))
    struct_density_normalized_counts = len(struct) / np.sum(
        struct_density) * struct_density

    # kernel density estimate for Pfam families without annotated structure
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=bandwidth).fit(no_struct.reshape(-1, 1))
    nostruct_density = np.exp(kde.score_samples(x_grid.reshape(-1, 1)))
    nostruct_density_normalized_counts = len(no_struct) / np.sum(
        nostruct_density) * nostruct_density

    ### add plot traces for struct
    trace_kde_struct = go.Scatter(
        x=x_grid,
        y=struct_density_normalized_counts,
        mode='lines',
        line=dict(color=colors[0], width=4),
        name="<b>with</b> structural <br>annotation (" + str(len(struct)) +
        ")")

    ### add plot traces for nostruct
    trace_kde_nostruct = go.Scatter(
        x=x_grid,
        y=nostruct_density_normalized_counts,
        mode='lines',
        line=dict(color=colors[1], width=4),
        name="<b>lacking</b> structural <br>annotation (" +
        str(len(no_struct)) + ")")

    # add vertical line for median of family size for families with structures
    median_struct = np.median(struct)
    trace_median_struct = go.Scatter(
        x=[median_struct, median_struct],
        y=[
            0,
            np.max([
                np.max(struct_density_normalized_counts),
                np.max(nostruct_density_normalized_counts)
            ])
        ],
        mode='lines+text',
        name="median family size",
        textfont=dict(family='sans serif', size=18, color=colors[0]),
        text=[
            "", " median: " + str(np.round(np.exp(median_struct), decimals=3))
        ],
        textposition='right',
        line=dict(color=colors[0], width=4, dash='dash'),
        showlegend=False)

    # add vertical line for median of family size for families with NO structures
    median_nostruct = np.median(no_struct)
    trace_median_nostruct = go.Scatter(
        x=[median_nostruct, median_nostruct],
        y=[
            0,
            np.max([
                np.max(struct_density_normalized_counts),
                np.max(nostruct_density_normalized_counts)
            ])
        ],
        mode='lines+text',
        name="median family size",
        line=dict(color=colors[1], width=4, dash='dash'),
        textfont=dict(family='sans serif', size=18, color=colors[1]),
        text=[
            "", "median: " +
            str(np.round(np.exp(median_nostruct), decimals=3)) + " "
        ],
        textposition="left",
        showlegend=False)

    data = [
        trace_kde_nostruct, trace_median_nostruct, trace_kde_struct,
        trace_median_struct
    ]
    layout = go.Layout(
        xaxis=dict(title='number of sequences per family',
                   tickvals=np.log([10, 100, 1000, 10000, 100000]),
                   ticktext=[
                       "$10^1$",
                       "$10^2$",
                       "$10^3$",
                       "$10^4$",
                       "$10^5$",
                   ],
                   exponentformat="e",
                   showexponent='All',
                   zeroline=False),
        yaxis=dict(title='number of protein families',
                   exponentformat="e",
                   showexponent='All',
                   zeroline=False),
        font=dict(size=18),
        legend=dict(x=0.75, y=0.88,
                    orientation="v"),  #horizontal legend below the plot
        title="PFAM family sizes <br> Pfam 31.0 (March 2017, 16712 entries)")

    #define plot figure
    fig = go.Figure(data=data, layout=layout)

    #plot with title
    plot_out = plot_dir + "/pfam_pdb.html"
    #plotly_plot(fig, filename=plot_out, auto_open=False)
    with_jax(fig, filename=plot_out)

    #plot without title
    fig['layout']['title'] = ""
    fig['layout']['margin']['t'] = 10
    fig['layout']['margin']['b'] = 150
    plot_out = plot_dir + "/pfam_pdb_notitle.html"

    #plotly_plot(fig, filename=plot_out, auto_open=False)
    with_jax(fig, filename=plot_out)
Example #12

def best_split(data, I=(-np.inf, np.inf)):
    '''With bimodal data, finding split at lowest density.'''
    h_crit = critical_bandwidth_m_modes(data, 2, I)
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=h_crit).fit(data.reshape(-1, 1))
    x = np.linspace(max(np.min(data), I[0]), min(np.max(data), I[1]), 200)
    y = np.exp(kde.score_samples(x.reshape(-1, 1)))
    modes = argrelextrema(np.hstack([[0], y, [0]]), np.greater)[0]
    if len(modes) != 2:
        raise ValueError("{} modes at: {}".format(len(modes), x[modes - 1]))
    ind_min = modes[0] - 1 + argrelextrema(y[(modes[0] - 1):(modes[1] - 1)],
                                           np.less)[0]
    return x[ind_min]


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    if 1:
        N = 1000
        data = np.hstack([np.random.randn(N // 2), np.random.randn(N // 4) + 4])
        h_crit = critical_bandwidth_m_modes(data, 2)
        x = np.linspace(-3, 8)
        y = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(
            data.reshape(-1, 1)).score_samples(x.reshape(-1, 1))
        fig, ax = plt.subplots()
        ax.plot(x, np.exp(y))
        ax.axvline(best_split(data, (1, 4)), color='red')
        plt.show()
Example #13
        rho[i,0]=round(rho[i,0],3)
    df_confidence=pd.DataFrame(Index,columns=['Index'])
    df_confidence['Ident']=Ident
    df_confidence['Ypred']=Ypred
    df_confidence['rho']=rho
    df_confidence['Yoriginal']=Yoriginal
    
    # plot the histogram
    ff.rhoHist(rho,n_equal_bins=100)   
    
    #X_plot = np.linspace(-1, 1, 100)[:, np.newaxis]
    X_plot = np.linspace(-1, 1, 100)
    X_plot = X_plot.reshape((-1,1))
    #bins = np.linspace(-1, 1, 50)

    fig, ax = plt.subplots(figsize=(8, 4))
   
    # tophat KDE
    #kde = KernelDensity(kernel='tophat', bandwidth=0.1).fit(rho)
    kde = KernelDensity(kernel='gaussian', bandwidth=0.04).fit(rho)
    log_dens = kde.score_samples(X_plot)
    ax.fill(X_plot, np.exp(log_dens), fc='#AAAAFF')
    #ax.fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
    #ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density")
    
    # calculate falserate according to different value of epsilon
    # eps==> maximum value of epsilon
    # num_eps==> the number of epsilon
    falseRate = ff.Predrejection(df_confidence,eps=0.8,num_eps=100) 

Example #14
#### SKLEARN KDE ##################################
from sklearn.neighbors import KernelDensity
#DELETE xyz = np.vstack([xi,yi,zi])

#original
d = values2.shape[0]  #num dimensions? should be 3 here
n = values2.shape[1]  #num samples?
bwsklearn = (n * (d + 2) / 4.)**(-1. / (d + 4))  # silverman
#bw = n**(-1./(d+4)) # scott
print('SKLEARN bw (silverman): {}'.format(bwsklearn))

kde2 = KernelDensity(
    bandwidth=bwsklearn,
    metric='minkowski',  #'euclidean',#
    kernel='gaussian',
    algorithm='ball_tree').fit(
        values2.T, y=None,
        sample_weight=None)  #Should have shape (n_samples, n_features)
#out42 = kde2.fit(values2.T, y=None, sample_weight=None) #Should have shape (n_samples, n_features)

# xmin = np.min(xi)
# xmax = np.max(xi)
# ymin = np.min(yi)
# ymax = np.max(yi)
# zmin = np.min(zi)
# zmax = np.max(zi)
#positions = np.vstack([xi.ravel(), yi.ravel(), zi.ravel()])

#DELETE X, Y, Z = np.mgrid[xmin:xmax:50j, ymin:ymax:50j, zmin:zmax:50j]
#DELETE positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()])
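A possible continuation, evaluating the fitted 3-D KDE on a regular grid in the spirit of the commented-out mgrid code (the 25-point resolution is illustrative, and values2 is assumed to have shape (3, n_samples)):

import numpy as np

mins, maxs = values2.min(axis=1), values2.max(axis=1)
axes = [np.linspace(lo, hi, 25) for lo, hi in zip(mins, maxs)]
X, Y, Z = np.meshgrid(*axes, indexing='ij')
positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()]).T  # (n_points, 3)
density = np.exp(kde2.score_samples(positions)).reshape(X.shape)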
Example #15
from sklearn.datasets import load_digits
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# load the data
digits = load_digits()

# project the 64-dimensional data to a lower dimension
pca = PCA(n_components=15, whiten=False)
data = pca.fit_transform(digits.data)

# use grid search cross-validation to optimize the bandwidth
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5, iid=False)
grid.fit(data)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_

# sample 44 new points from the data
new_data = kde.sample(44, random_state=0)
new_data = pca.inverse_transform(new_data)

# turn data into a 4x11 grid
new_data = new_data.reshape((4, 11, -1))
real_data = digits.data[:44].reshape((4, 11, -1))
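A minimal plotting sketch for comparing the drawn samples with the originals (assuming matplotlib; the layout and titles are illustrative):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(9, 11, subplot_kw=dict(xticks=[], yticks=[]))
for j in range(11):
    ax[4, j].set_visible(False)  # blank row separating the two grids
    for i in range(4):
        ax[i, j].imshow(real_data[i, j].reshape(8, 8), cmap='binary')
        ax[i + 5, j].imshow(new_data[i, j].reshape(8, 8), cmap='binary')
ax[0, 5].set_title('selection from the input data')
ax[5, 5].set_title('digits sampled from the kernel density model')
plt.show()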
Example #16
def _bivariate_kdeplot(x,
                       y,
                       xscale=None,
                       yscale=None,
                       shade=False,
                       bw="scott",
                       gridsize=50,
                       cut=3,
                       clip=None,
                       legend=True,
                       legend_data=None,
                       **kwargs):

    ax = plt.gca()
    label = kwargs.pop('label', None)

    # Determine the clipping
    clip = [(-np.inf, np.inf), (-np.inf, np.inf)]

    x = xscale(x)
    y = yscale(y)

    x_nan = np.isnan(x)
    y_nan = np.isnan(y)

    x = x[~(x_nan | y_nan)]
    y = y[~(x_nan | y_nan)]

    if bw == 'scott':
        bw_x = bw_scott(x)
        bw_y = bw_scott(y)
        bw = (bw_x + bw_y) / 2
    elif bw == 'silverman':
        bw_x = bw_silverman(x)
        bw_y = bw_silverman(y)
        bw = (bw_x + bw_y) / 2
    elif isinstance(bw, float):
        bw_x = bw_y = bw
    else:
        raise util.CytoflowViewError(
            None, "Bandwith must be 'scott', 'silverman' or a float")

    kde = KernelDensity(bandwidth=bw,
                        kernel='gaussian').fit(np.column_stack((x, y)))

    x_support = _kde_support(x, bw_x, gridsize, cut, clip[0])
    y_support = _kde_support(y, bw_y, gridsize, cut, clip[1])

    xx, yy = np.meshgrid(x_support, y_support)
    z = kde.score_samples(np.column_stack((xx.ravel(), yy.ravel())))
    z = z.reshape(xx.shape)
    z = np.exp(z)

    n_levels = kwargs.pop("n_levels", 10)
    color = kwargs.pop("color")
    kwargs['colors'] = (color, )

    min_alpha = kwargs.pop("min_alpha", 0.2)
    if shade:
        min_alpha = 0

    max_alpha = kwargs.pop("max_alpha", 0.9)

    x_support = xscale.inverse(x_support)
    y_support = yscale.inverse(y_support)
    xx, yy = np.meshgrid(x_support, y_support)

    contour_func = ax.contourf if shade else ax.contour
    try:
        cset = contour_func(xx, yy, z, n_levels, **kwargs)
    except ValueError as e:
        raise util.CytoflowViewError(
            None, "Something went wrong in {}, bandwidth = {}.  ".format(
                contour_func.__name__, bw)) from e
    num_collections = len(cset.collections)

    alpha = np.linspace(min_alpha, max_alpha, num=num_collections)
    for el in range(num_collections):
        cset.collections[el].set_alpha(alpha[el])

    # Label the axes
    if hasattr(x, "name") and legend:
        ax.set_xlabel(x.name)
    if hasattr(y, "name") and legend:
        ax.set_ylabel(y.name)

    if label is not None:
        ax.set_title(label)

    # Add legend data
    if label is not None and legend_data is not None:
        legend_data[label] = plt.Rectangle((0, 0), 1, 1, fc=color)

    return ax
Example #17
import numpy as np
from scipy import stats
import matplotlib.pyplot as pltV
from sklearn.neighbors import KernelDensity

fig, plt = pltV.subplots(1, 1)

lamb = 1.5
t = stats.expon.rvs(size=20, scale=1 / lamb)
c = stats.expon.rvs(size=80, scale=1 / lamb)
j = stats.expon.rvs(size=150, scale=1 / lamb)

print(t)
print(c)
print(j)
print(stats.expon.fit(t))
print(stats.expon.fit(c))
print(stats.expon.fit(j))

x242 = np.linspace(0, 8).reshape(-1, 1)
expo1 = stats.expon.pdf(x242)
plt.plot(expo1, 'r-')

kde1 = KernelDensity(kernel='exponential').fit(x242)
norm2412 = np.exp(kde1.score_samples(x242))

plt.plot(norm2412)
plt.plot(stats.expon.cdf(expo1), 'g-')
pltV.show()
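Note that kde1 above is fitted on the evaluation grid x242 rather than on the simulated draws. A sketch of fitting on the samples themselves (reusing t, c and j from above; the 0.5 bandwidth is an arbitrary choice):

samples = np.concatenate([t, c, j]).reshape(-1, 1)
kde_data = KernelDensity(kernel='exponential', bandwidth=0.5).fit(samples)
plt.plot(x242[:, 0], np.exp(kde_data.score_samples(x242)), 'b--')  # plt is the Axes created above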
Example #18
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

def bw_kde(data, start=0.01, end=1.0, cv_size=20):
    grid = GridSearchCV(KernelDensity(), {'bandwidth': np.linspace(start, end, cv_size)}, cv=cv_size)
    grid.fit(np.asarray(data)[:, None])
    return grid.best_params_
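A hypothetical usage sketch for bw_kde (the toy data and search range are illustrative):

import numpy as np

rng = np.random.default_rng(0)
sample = rng.normal(size=200)                # 1-D data, as bw_kde expects
best = bw_kde(sample, start=0.05, end=0.5, cv_size=10)
print(best)                                  # e.g. {'bandwidth': 0.2}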
Example #19
    'blizzard': ('2015-01-26 00:00:00', '2015-01-28 00:00:00')
}
taxi['event'] = np.zeros(len(taxi))
for event, duration in events.items():
    start, end = duration
    taxi.loc[start:end, 'event'] = 1

for event, duration in events.items():
    start, end = duration
    print("a")

y = y.reshape(y.shape[0], 1)
y = scale(y)

# KDE
kernaldens = KernelDensity(kernel="gaussian", bandwidth=0.75).fit(y)
print(kernaldens)

scores = kernaldens.score_samples(y)
thresh = quantile(scores, .01)
print(thresh)
index = where(scores <= thresh)
values = y[index]

x_ax = range(y.shape[0])
plt.plot(x_ax, y)
plt.scatter(index, values, color='r')
plt.show()

# TOOLTIPS = [
#     ("index", "$index"),
Example #20
 def __init__(self, n_jobs=1, cv=5, bw=np.linspace(0.1, 1.0, 10)):
     self.grid = GridSearchCV(KernelDensity(), {'bandwidth': bw},
                              cv=cv,
                              n_jobs=n_jobs)  # cv-fold cross-validation (cv defaults to 5)
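A plausible companion method for the same class, fitting the grid on 1-D data and keeping the tuned estimator (the method and attribute names below are assumptions, not part of the original snippet; numpy is assumed to be imported as np):

 def fit(self, x):
     # run the bandwidth grid search on 1-D data and keep the tuned KDE
     self.grid.fit(np.asarray(x).reshape(-1, 1))
     self.kde_ = self.grid.best_estimator_
     return self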
Example #21
import matplotlib.pyplot as plt
from distutils.version import LooseVersion
from scipy.stats import norm
from sklearn.neighbors import KernelDensity

with open("repeated/BOHB_config/usage.json", encoding='utf-8') as f:
    data = json.load(f)
    _x = []
    for params in data:
        channel_2_num = params["config"]["channel_2_num"] + 32
        _x.append(channel_2_num)

    X = np.concatenate((_x, []))[:, np.newaxis]

    X_plot = np.linspace(30, 70, 1000)[:, np.newaxis]

    fig, ax = plt.subplots()

    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0],
            np.exp(log_dens),
            '-',
            label="channel_2_num range= '{0}'".format('[32, 64]'))

    ax.legend(loc='upper left')
    ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

    ax.set_xlim(30, 70)
    ax.set_ylim(0, 0.3)
    plt.show()
Example #22
 def _sample_posteriors(self, arm):
     kde = KernelDensity()
     kde.fit(
         pd.DataFrame(
             self.trace[arm]['mu'][-(self.samples_num - self.burn_num):]))
     return float(kde.sample())
Example #23
n_samples = 1000
linear_gaussian_net = linear_gaussian_generation.generate_sparse_linear_gaussian_system(n_vars, max_deg, (0.2, 1), (-1, 1))

X = np.asarray(linear_gaussian_net.get_joint_samples(n_samples)).astype(np.float64)

#print("X: ", X)
'''
X = load_data.load_kde_cleaned_airline_data("Iberia").to_numpy()
#X = X[:1000, :]

X_train = X[:int(0.7 * X.shape[0]), :]
X_test = X[int(0.7 * X.shape[0]):, :]
bandwidth = silverman_scalar_bandwidth(X_train)
print("bandwidth: ", bandwidth)
kde_on_X = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X_train)
kde_on_X_test_log_likelihood = np.sum(kde_on_X.score_samples(X_test))
normal_dist_on_X = stats.multivariate_normal(mean=np.mean(X_train, axis=0),
                                             cov=np.cov(X_train.T))
normal_dist_on_X_test_log_likelihood = np.sum(normal_dist_on_X.logpdf(X_test))
print("X shape: ", X.shape)

initial_dags = [
    random_graph.random_dag(X.shape[1], max_deg) for i in range(0, 1)
]

kernel = 'gaussian'

print("kde_on_X_test_log_likelihood: ", kde_on_X_test_log_likelihood)
print("normal dist test log likelihood: ",
      normal_dist_on_X_test_log_likelihood)
Example #24
def optimize_bd(dfGenome, dfPos, dfGene, outpath):

    "Bandwidth optimization by fitting the density to positive set"

    dfPos['mid'] = ((dfPos['end'] - dfPos['start']) / 2) + dfPos['start']

    chrs = list(dfGenome.chrom.unique())

    bdlist = list(np.linspace(1000, 1000000, 1000))

    sc = np.array([0.0] * (len(bdlist) + 1))

    for chrname in chrs:

        chrlen = int(dfGenome[dfGenome.chrom == chrname].length)

        N = dfPos[dfPos.chrom == chrname].shape[0]

        dfchr = dfGene[dfGene.chrom == chrname]

        dfPosChr = dfPos[dfPos.chrom == chrname]

        Xp = np.array(list(dfPosChr['mid']))[:, np.newaxis]

        X = np.array(list(dfchr['mid']))[:, np.newaxis]

        ## estimate the density every 1000 bp

        X_plot = np.linspace(0, chrlen, int(chrlen / 1000))[:, np.newaxis]

        b = np.array([[0, 0]])

        print("optimization for", chrname)

        for bd in bdlist:

            kde = KernelDensity(kernel='gaussian', bandwidth=bd).fit(X)

            a = np.c_[bd, kde.score(Xp)]

            b = np.r_[b, a]

        sc[:] = sc[:] + b[:, 1]

    end = np.c_[bdlist, list(sc[1:, ])]

    idxrow = np.argwhere(end == max(end[:, 1]))[0, 0]
    newbd = int(end[idxrow, 0])

    print("the bandwith is", newbd)

    #plt.plot(bdlist, list(sc[1:,]))
    #plt.title("genome")
    #plt.xlabel("bandwidth (bp)")
    #plt.ylabel("log score of positive set")
    #plt.savefig(path + 'gene_density_optimization.png')
    #plt.close()

    dfout = pd.DataFrame({'A': bdlist, 'B': sc[1:, ]})

    dfout.to_csv(path_or_buf=outpath + "bandwidth_trials.txt",
                 sep='\t',
                 header=False,
                 index=False)

    return newbd
Example #25
        sim_2niso_V = sim_2niso_V.reshape((LX, LX))

        norm_2niso = np.sqrt((sim_2niso_U**2 + sim_2niso_V**2) / float(c2))
        norm_2iso = np.sqrt((sim_2iso_U**2 + sim_2iso_V**2) / float(c2))

        norm_2iso_histo = norm_2iso.reshape([LXA])
        norm_2niso_histo = norm_2niso.reshape([LXA])

        u_bins_iso_E6[G] = np.linspace(np.log(np.amin(norm_2iso_histo)),
                                       np.log(np.amax(norm_2iso_histo)), 2**12)

        u_bins_niso_E6[G] = np.linspace(np.log(np.amin(norm_2niso_histo)),
                                        np.log(np.amax(norm_2niso_histo)),
                                        2**12)

        kde_iso = KernelDensity(bandwidth=0.25, kernel='gaussian')
        kde_iso.fit(np.log(norm_2iso_histo)[:, None])
        logprob_iso_E6[G] = kde_iso.score_samples(u_bins_iso_E6[G][:, None])

        kde_niso = KernelDensity(bandwidth=0.25, kernel='gaussian')
        kde_niso.fit(np.log(norm_2niso_histo)[:, None])
        logprob_niso_E6[G] = kde_niso.score_samples(u_bins_niso_E6[G][:, None])

    MDH.PushData(data=u_bins_iso_E6, key='u_bins_iso' + 'E6')
    MDH.PushData(data=u_bins_niso_E6, key='u_bins_niso' + 'E6')
    MDH.PushData(data=logprob_iso_E6, key='logprob_iso' + 'E6')
    MDH.PushData(data=logprob_niso_E6, key='logprob_niso' + 'E6')

    for G in [-1.75, -3.6]:
        print("G: ", G)
        sim_2iso_u = UFields['G' + str(G) + 'LX' + str(LX) + 'E8P4']
Example #26
     lastpdfhf, lastpdfcv = None, None
     for ii in [np.where(x == TEST)[0][0]]:
         thesex, thesey, vesey = [], [], []
         for jj in range(epsh.shape[1]):
             ys.append(epsh[ii, jj])
             y2s.append(epscv[ii, jj])
             xs.append(x[ii])
             thesey.append(y2s[-1])
             vesey.append(ys[-1])
         np.save('thesey%i' % BS, thesey)
         thesey = np.array(thesey)
         vesey = np.array(vesey)
         xx = np.linspace(np.min(vesey), max(vesey), 100)
         #xx = np.linspace(min(vesey), max(vesey), 100)
         print('---->', min(thesey), max(thesey))
         kde2 = KernelDensity(kernel='gaussian', bandwidth=.05)
         kde2.fit(vesey.reshape(-1, 1))
         pdff = np.exp(kde2.score_samples(xx.reshape(-1, 1)))
         if lastpdfcv is not None: cvDs.append(entropy(pdff, lastpdfcv))
         lastpdfcv = pdff.copy()
         ax[0].plot(xx, pdff / simps(pdff, xx), label=BS, c=colors[ll])
         #xx = np.linspace(0, 2 * max(thesey), 1000)
         xx = np.linspace(min(thesey), max(thesey), 100)
         kde = KernelDensity(kernel='gaussian', bandwidth=.01)
         kde.fit(thesey.reshape(-1, 1))
         pdff = np.exp(kde.score_samples(xx.reshape(-1, 1)))
         if lastpdfhf is not None: hfDs.append(entropy(pdff, lastpdfhf))
         lastpdfhf = pdff.copy()
         ax[1].plot(xx, pdff / simps(pdff, xx), label=BS, c=colors[ll])
 ax[0].hist(vesey, 200, alpha=0.3, density=True)
 ax[1].hist(thesey, 200, alpha=0.3, density=True)
Example #27
def extract_profiles_union(global_data,target_ind_dict,threshold,P):
    ## estimate the bandwidth
    params = {'bandwidth': np.linspace(np.min(global_data), np.max(global_data),20)}
    grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)

    ## perform MeanShift clustering.
    combine= {}
    for bull in target_ind_dict.keys():
        grid.fit(global_data[target_ind_dict[bull],:])
        combine[bull]= grid.best_estimator_    

    Stats= recursively_default_dict()

    for combo in it.combinations(target_ind_dict.keys(),2):
        pop1= combo[0]
        pop2= combo[1]

        All_coords= [x for x in it.chain(*[target_ind_dict[z] for z in combo])]

        Quanted_set= global_data[All_coords,:]

        i_coords, j_coords, z_coords = np.meshgrid(np.linspace(min(Quanted_set[:,0]),max(Quanted_set[:,0]),P),
                              np.linspace(min(Quanted_set[:,1]),max(Quanted_set[:,1]),P),
                                np.linspace(min(Quanted_set[:,2]),max(Quanted_set[:,2]),P), indexing= 'ij')


        traces= [x for x in it.product(range(P),range(P),range(P))]

        background= np.array([i_coords,j_coords,z_coords])

        background= [background[:,c[0],c[1],c[2]] for c in traces]

        background=np.array(background)

        pop1_fist= combine[pop1].score_samples(background)
        #pop1_fist= np.exp(pop1_fist)
        P_dist_pop1= combine[pop1].score_samples(global_data[target_ind_dict[pop1],:])
        pop1_fist = scipy.stats.norm(np.mean(P_dist_pop1),np.std(P_dist_pop1)).cdf(pop1_fist)
        pop1_fist= [int(x >= threshold) for x in pop1_fist]
        
        pop2_fist= combine[pop2].score_samples(background)
        #pop2_fist= np.exp(pop2_fist)
        P_dist_pop2= combine[pop2].score_samples(global_data[target_ind_dict[pop2],:])
        pop2_fist = scipy.stats.norm(np.mean(P_dist_pop2),np.std(P_dist_pop2)).cdf(pop2_fist)
        pop2_fist= [int(x >= threshold) for x in pop2_fist]

        
        pop1_and_2= len([x for x in range(background.shape[0]) if pop1_fist[x] == 1 and pop2_fist[x] == 1])
        pop1_I_pop2= pop1_and_2 / float(sum(pop1_fist))
        pop2_I_pop1= pop1_and_2 / float(sum(pop2_fist))
        
        total_overlap= pop1_and_2 / float(sum(pop1_fist) + sum(pop2_fist) - pop1_and_2)
        
        empty_space= 1 - (sum(pop1_fist) + sum(pop2_fist) - pop1_and_2) / background.shape[0]
        
        Stats[combo][pop1]= pop1_I_pop2
        Stats[combo][pop2]= pop2_I_pop1
        Stats[combo]['empty']= empty_space
        Stats[combo]['PU']= total_overlap
        
    
    return Stats
Example #28
    def calc_pdist(df,
                   columns=None,
                   mode="kde",
                   bandwidth=None,
                   grid=None,
                   **kwargs):
        """
        Calculates probability distribution over DataFrame.

        Arguments:
          df (DataFrame): DataFrame over which to calculate probability
            distribution of each column over rows
          columns (list): Columns for which to calculate probability
            distribution
          mode (ndarray, str, optional): Method of calculating
            probability distribution; eventually will support 'hist' for
            histogram and 'kde' for kernel density estimate, though
            presently only 'kde' is implemented
          bandwidth (float, dict, str, optional): Bandwidth to use for
            kernel density estimates; may be a single float that will be
            applied to all columns or a dictionary whose keys are column
            names and values are floats corresponding to the bandwidth
            for each column; for any column for which *bandwidth* is not
            specified, the standard deviation will be used
          grid (list, ndarray, dict, optional): Grid on which to
            calculate kernel density estimate; may be a single ndarray
            that will be applied to all columns or a dictionary whose
            keys are column names and values are ndarrays corresponding
            to the grid for each column; for any column for which *grid*
            is not specified, a grid of 1000 points between the minimum
            value minus three times the standard deviation and the
            maximum value plus three times the standard deviation will
            be used
          kde_kw (dict, optional): Keyword arguments passed to
            :function:`sklearn.neighbors.KernelDensity`
          verbose (int): Level of verbose output
          kwargs (dict): Additional keyword arguments

        Returns:
          OrderedDict: Dictionary whose keys are columns in *df* and
          values are DataFrames whose indexes are the *grid* for that
          column and contain a single column 'probability' containing
          the normalized probability at each grid point

        .. todo:
            - Implement flag to return single dataframe with single grid
        """
        from sklearn.neighbors import KernelDensity

        # Process arguments
        verbose = kwargs.get("verbose", 1)
        if verbose >= 1:
            wiprint("""Calculating probability distribution over DataFrame""")

        if mode == "kde":

            # Prepare bandwidths
            if bandwidth is None:
                bandwidth = df.values.std()

            # Prepare grids
            if grid is None:
                grid = np.linspace(df.values.min() - 3 * bandwidth,
                                   df.values.max() + 3 * bandwidth, 1000)
            elif isinstance(grid, list):
                grid = np.array(grid)

            # Calculate probability distributions
            kde_kw = kwargs.get("kde_kw", {})
            pdist = np.zeros((grid.size, df.columns.size))
            for i, column in enumerate(df.columns.values):
                series = df[column]
                if verbose >= 1:
                    wiprint(
                        "calculating probability distribution of "
                        "{0} using a kernel density estimate".format(column))
                kde = KernelDensity(bandwidth=bandwidth, **kde_kw)
                kde.fit(series.dropna()[:, np.newaxis])
                pdf = np.exp(kde.score_samples(grid[:, np.newaxis]))
                pdf /= pdf.sum()
                pdist[:, i] = pdf
            pdist = pd.DataFrame(pdist, index=grid, columns=df.columns)
        else:
            raise Exception(
                sformat("""only kernel density estimation is
                                    currently supported"""))

        return pdist
Example #29
def DBRE_analyzer(filename):
    global df, reset_time, max_time, num_measurements, min_plateau_len, printplots, index
    try:  #to read the text file
        raw_data = pd.read_csv(filename + '.DTA',
                               sep='\t',
                               header=None,
                               usecols=[2, 3],
                               skiprows=64,
                               names=['Time', 'Voltage'])
    except:  #if file is empty, wait reset_time
        time.sleep(reset_time)
        return DBRE_analyzer(filename)

    #check again if file is empty, and if so, wait reset_time before retrying
    if raw_data.empty:
        time.sleep(reset_time)
        return DBRE_analyzer(filename)

    #extract date, time, charging time, then convert to hours elapsed
    experimentnumber = filename[index:]
    f = open(filename + '.DTA', 'r')
    lines = f.readlines()
    datestamp = lines[3].split('\t')[2]
    timestamp = lines[4].split('\t')[2]
    datetimestamp = datetime.strptime(datestamp + ' ' + timestamp,
                                      '%m/%d/%Y %H:%M:%S')
    dt = datetimestamp - start_time
    hours = dt.total_seconds() / 3600
    charging_time = float(lines[11].split('\t')[2])
    f.close()

    #Export raw discharge curve to Excel datafile
    raw_data.to_excel(filename + '.xlsx')

    #filter out times past the maximum time
    raw_data = raw_data[raw_data.Time <= max_time]

    #If datafile is non-physical, skip to next
    if any(abs(raw_data.Voltage) > voltage_lims[2]):
        new_number = int(experimentnumber) + 1
        if new_number > num_measurements:
            return 'Done'
        new_filename = filename[:index]
        new_filename = new_filename + str(new_number)
        return DBRE_analyzer(
            new_filename)  #recursive loop until all files parsed

    #Produce discharge plot
    if printplots:
        plt.figure()
        plt.suptitle('Discharge for run #' + experimentnumber)
        #VOLTAGE PLOT
        top = plt.subplot(2, 1, 1)
        plt.plot(raw_data.Time, raw_data.Voltage)
        plt.axis([
            -10, max_time,
            min(raw_data.Voltage), raw_data['Voltage'].iloc[-1] + 0.05
        ])
        plt.xlabel('Time (s)')
        plt.ylabel('Voltage (V)')

    #Filter out charging step
    raw_data = raw_data[raw_data.Time > charging_time]
    raw_data = raw_data.reset_index()
    #Stop if datafile is incomplete
    if raw_data.empty:
        return 'Done'

    #Use KDE to find plateau
    voltage_data = np.array(raw_data.Voltage)
    voltage_data = voltage_data.reshape(-1, 1)
    X_plot = np.linspace(np.amin(voltage_data), np.amax(voltage_data),
                         1000)[:, np.newaxis]
    kde = KernelDensity(bandwidth=0.01).fit(voltage_data)
    log_dens = kde.score_samples(X_plot)
    mi, ma = argrelextrema(log_dens,
                           np.less)[0], argrelextrema(log_dens, np.greater)[0]
    if len(mi) > 0:
        i = 0
        plateau = voltage_data[voltage_data < X_plot[mi[i]]]
        while len(plateau) < min_plateau_len and i + 1 < len(mi):
            plateau = voltage_data[np.logical_and(
                voltage_data < X_plot[mi[i + 1]],
                voltage_data > X_plot[mi[i]])]
            i += 1
        if len(plateau) < min_plateau_len and i + 1 == len(mi):
            plateau = voltage_data[voltage_data > X_plot[mi[i]]]
    else:
        plateau = voltage_data
    weights = wts(plateau)
    voltage = -np.average(plateau, weights=weights)
    uncertainty = tstd(plateau)

    #If result is non-physical, skip to next datafile
    if voltage > voltage_lims[1] or voltage < voltage_lims[0]:
        new_number = int(experimentnumber) + 1
        if new_number > num_measurements:
            return 'Done'
        new_filename = filename[:index]
        new_filename = new_filename + str(new_number)
        return DBRE_analyzer(
            new_filename)  #recursive loop until all files parsed

    #plot KDE curve
    if printplots:
        bottom = plt.subplot(2, 1, 2)
        plt.plot(X_plot[:, 0],
                 np.exp(log_dens),
                 color='darkviolet',
                 lw=2,
                 linestyle='-')
        plt.plot(voltage_data[:, 0],
                 -0.005 - 0.01 * np.random.random(voltage_data.shape[0]), '+k')
        plt.xlabel('Voltage (V)')
        plt.ylabel('Probability Density')
        plt.axis([np.amin(voltage_data), np.amax(voltage_data), -0.02, 2])
    # plt.axis([np.amin(voltage_data), np.amax(voltage_data), -0.02, np.exp(np.amax(log_dens))+0.02])

    #add plateau line to discharge plot
    if printplots:
        top.plot([-10, max_time], [-voltage, -voltage], '--k')
        #save the plot
        plt.savefig(filename + '.png', dpi=300)  # Save the figure
        plt.close()

    #add info to overall Excel file, DBRE_Summary.xlsx
    df = df.append(
        {
            'Hours': hours,
            'Date': datestamp,
            'Time': timestamp,
            'Potential': voltage,
            'Uncertainty': uncertainty
        },
        ignore_index=True)  #add values to overall dataframe
    df.to_excel('DBRE_Summary.xlsx')

    #plot salt potential over time after each trial is done
    plt.figure()
    plt.suptitle('Salt Potential Over Time')
    plt.errorbar(df.Hours,
                 df.Potential,
                 yerr=df.Uncertainty,
                 color='blue',
                 ecolor='black',
                 fmt='o',
                 capsize=5)
    plt.xlabel('Time (hr)')
    plt.ylabel('Salt Potential (V vs Be|Be2+)')
    plt.ticklabel_format(axis='x', style='plain', useOffset=False)
    plt.savefig('DBRE_Summary.png', dpi=300)
    plt.close()

    #prepare to either read next file or stop
    new_number = int(experimentnumber) + 1
    if new_number > num_measurements:
        return 'Done'
    new_filename = filename[:index]
    new_filename = new_filename + str(new_number)
    return DBRE_analyzer(new_filename)  #recursive loop until all files parsed
Example #30
# In[18]:

# Compute the SPE statistic
X_pca_SPE = Series(np.sum((X - X_pca_recover)**2, axis=1), index=X.index)

# #### Determine the threshold from a confidence level

# ##### Option 1: estimate the probability density with scikit-learn's KDE API

# In[19]:

from sklearn.neighbors import KernelDensity

# In[20]:

X_pca_T2_scikit_kde = KernelDensity().fit(X_pca_T2.reshape(
    -1, 1))  # reshape(-1, 1) is required by the API; otherwise the data is treated as a single sample and density estimation is meaningless
X_pca_SPE_scikit_kde = KernelDensity().fit(X_pca_SPE.reshape(-1, 1))

# In[21]:

X_pca_T2_sort = X_pca_T2.sort_values()
plt.plot(
    np.exp(X_pca_T2_scikit_kde.score_samples(X_pca_T2_sort.reshape(-1, 1))))

# In[22]:

X_pca_T2_dens_plot = np.linspace(0, 50, 1000)
plt.plot(
    np.exp(X_pca_T2_scikit_kde.score_samples(X_pca_T2_dens_plot.reshape(-1,
                                                                        1))))