Example #1
class LeaveOneOutEntropyEstimator(ItEstimator):
    """
    Leave One Out cross-validation entropy estimation from datapoints by
    using kernel estimation of the probability density
    See More:
    Ivanov A. V. and Rozhkova . Properties of the statistical estimate of the
    entropy of a random vector with a probability density
    """

    def __init__(self, kernel, min_log_proba, bandwidth=1.0):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
        self.min_log_proba = min_log_proba

    def estimateFromData(self, datapoints):
        entropy = 0.0
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)
        for i in range(datapoints.shape[0]):
            curr = np.delete(datapoints, i, axis=0)
            self.kde.fit(curr)
            score = self.kde.score(datapoints[None, i, :])
            if score < self.min_log_proba:
                print(score)
                continue

            entropy -= score

        return entropy / datapoints.shape[0]

    def entropy(self, X):
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
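
# A minimal usage sketch for the estimator above (not from the original
# source; assumes numpy, sklearn's KernelDensity and the ItEstimator base
# are importable). On standard-normal samples the leave-one-out estimate
# should approach the analytic entropy 0.5*ln(2*pi*e) ~= 1.4189 nats.
import numpy as np

np.random.seed(0)
samples = np.random.normal(0.0, 1.0, size=500)
est = LeaveOneOutEntropyEstimator(kernel='gaussian',
                                  min_log_proba=-500, bandwidth=0.3)
print(est.estimateFromData(samples), 0.5 * np.log(2 * np.pi * np.e))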
Example #2
def test2(data, col):
    # KernelDensity expects a 2-D array; keep the column as shape (n, 1)
    vals = data[[col]].values
    kdens = KernelDensity(kernel='gaussian', bandwidth=0.5,
                          rtol=1E-2).fit(vals)
    s = kdens.score(vals)  # total log-likelihood of the data under the fit
    print("Score:", s)
    return s
Example #3
def estimate_distribution(samples, h=0.1, n_points=100):
    kde = KernelDensity(bandwidth=h)
    samples = samples[:, np.newaxis]
    kde.fit(samples)
    xs = np.linspace(-1.0, 1.0, n_points)
    # score_samples takes the (n_points, 1) grid and returns per-point
    # log-densities; score([x]) would fail on a 1-D input
    ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
    return xs, ys
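
# A short usage sketch (illustrative data, not from the original source;
# assumes numpy and matplotlib are available):
import numpy as np
import matplotlib.pyplot as plt

samples = np.random.uniform(-0.8, 0.8, size=200)
xs, ys = estimate_distribution(samples, h=0.1, n_points=100)
plt.plot(xs, ys)  # estimated density over [-1, 1]
plt.show()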
Example #4
def check_if_events_in_cluster(points, events, event_time,
                               n_selection=n_selection_po, multiprocess=True,
                               event_type='po', ):
    #pylint: disable=redefined-outer-name
    '''check if a list of events are in the 4D cluster.'''
    output = {'event_number': [], 'run_number': [], 'in_veto_volume': [], }
    data_arr_nowall = remove_wall_points_np(data_arr_from_points(points))
    #print(data_arr_nowall.shape)
    if not data_arr_nowall.shape[0]:
        warn.warn('No points left in cluster after removing wall points',
                  RuntimeWarning)
        for row in events.iterrows():
            output['event_number'].append(row[1].event_number)
            output['run_number'].append(row[1].run_number)
            output['in_veto_volume'].append(False)
        return output
    if events.empty:
        return output
    data_arr_scores = kde_likelihood(data_arr_nowall,
                                     multiprocess=multiprocess,
                                     event_type=event_type)
    data_arr_selected = data_arr_scores[-len(data_arr_scores)//n_selection:]
    db = DBSCAN(eps=DBSCAN_radius,
                min_samples=DBSCAN_samples)\
                .fit(pd.DataFrame(data_arr_selected).values[:, :4])
    data_arr_cluster = np.zeros(data_arr_selected.shape,
                                dtype=[('x', np.double),
                                       ('y', np.double),
                                       ('z', np.double),
                                       ('t', np.double),
                                       ('score', np.double),
                                       ('label', int)])
    data_arr_cluster['x'] = data_arr_selected['x']
    data_arr_cluster['y'] = data_arr_selected['y']
    data_arr_cluster['z'] = data_arr_selected['z']
    data_arr_cluster['t'] = data_arr_selected['t']
    data_arr_cluster['score'] = data_arr_selected['score']
    data_arr_cluster['label'] = db.labels_
    data_arr_df = pd.DataFrame(data_arr_cluster)
    data_wo_outliers = data_arr_df.query('label != -1').values[:, :4]
    selected_fit = KernelDensity(kernel='tophat', rtol=kde_rtol,
                                 bandwidth=kernel_radius).fit(data_wo_outliers)
    for row in events.iterrows():
        t = abs(row[1].event_time - event_time)/(2*timestep)
        score = selected_fit.score([[row[1].x_3d_nn,
                                     row[1].y_3d_nn,
                                     row[1].z_3d_nn,
                                     t]])
        output['event_number'].append(row[1].event_number)
        output['run_number'].append(row[1].run_number)
        output['in_veto_volume'].append(score != -np.inf)
    return output
Example #5
def calc_score(data, cols):
    vals = data[list(cols)].values

    # Calculate best bandwidth for KDE
    params = {'bandwidth': np.logspace(-2, 5, 20)}
    grid = GridSearchCV(KernelDensity(kernel='gaussian', rtol=1E-6),
                        params,
                        cv=2)
    grid.fit(vals)

    kdens = KernelDensity(kernel='gaussian',
                          bandwidth=grid.best_estimator_.bandwidth,
                          rtol=1E-6).fit(vals)
    return kdens.score(vals)
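
# A hypothetical call (illustrative DataFrame; assumes pandas, numpy and
# GridSearchCV from sklearn.model_selection are imported):
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.random.randn(200), 'b': np.random.randn(200)})
print(calc_score(df, ('a', 'b')))  # total log-likelihood at the best bandwidth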
Example #6
class KDEntropyEstimator(ItEstimator):
    discrete = False

    def __init__(self,
                 kernel="gaussian",
                 min_log_proba=-500,
                 bandwidth=1.0,
                 kfold=10):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
        self.min_log_proba = min_log_proba
        self.kfold = kfold

    def estimateFromData(self, datapoints):
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)

        entropy = 0.0

        n, d = datapoints.shape
        ma = np.ones(n, dtype=bool)  # np.bool was removed in NumPy 1.24
        unit = n // self.kfold
        rem = n % self.kfold

        start = 0
        end = unit + rem
        for i in range(self.kfold):
            sel = np.arange(start, end)
            ma[start:end] = False
            curr = datapoints[ma, :]

            self.kde.fit(curr)
            score = self.kde.score(datapoints[sel, :])

            ma[:] = True
            start = end
            end = min(unit + end, n)

            if score < self.min_log_proba:
                continue

            entropy -= score

        return entropy / n

    def entropy(self, X):
        np.random.seed(0)
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
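
# A hypothetical run of the k-fold estimator above (assumes numpy and the
# class's KernelDensity import). For standard-normal data the estimate
# should again be near 0.5*ln(2*pi*e):
import numpy as np

X = np.random.normal(size=2000)
est = KDEntropyEstimator(bandwidth=0.3, kfold=10)
print(est.entropy(X))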
Example #7
    def test(self, member_id, potential_events, info_repo, simscores):
        ## input : member_id, list of potential events
        ## output : PDE scores
        events_info = info_repo["events_info"]
        member_events = np.array(self.training_vecs[member_id])
        #print "member id : ", member_id
        #print "events : ", member_events

        # Found no past history for this user, return.
        # Sorry, can't help without history as no data to fit distribution.
        if len(member_events) == 0:
            return

        kde = KernelDensity(kernel='gaussian').fit(member_events)
        similarity_scores = []
        for event_id in potential_events:
            lat = events_info[event_id]["lat"]
            lon = events_info[event_id]["lon"]
            # similarity_scores.append(np.exp(kde.score([np.array([lat, lon]).T])))
            similarity_scores.append(simscores[member_id][event_id])
            simscores[member_id][event_id] = np.exp(
                kde.score([np.array([lat, lon]).T]))
Example #8
def test(models, device):
    test_dataset = datasets.MNIST(config.dataset_dir,
                                  train=False,
                                  transform=transforms.ToTensor())
    test_loader = DataLoader(test_dataset,
                             batch_size=config.batch_size,
                             num_workers=config.num_workers)

    X_data = np.zeros((10000, 784), dtype=np.float32)
    X_generated = np.zeros((10000, 784), dtype=np.float32)
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            noise = torch.rand(
                (data.size(0), config.noise_features), device=device) * 2 - 1
            generated = models.gen(noise)

            start = i * config.batch_size
            end = min((i + 1) * config.batch_size, 10000)
            X_data[start:end] = data.view(-1, 784).numpy()
            X_generated[start:end] = generated.cpu().numpy()

    print("Calculating the score...")
    kde = KernelDensity(bandwidth=0.2).fit(X_generated)
    print("Score: {:.4f}".format(kde.score(X_data) / 10000))
Example #9
from os.path import expanduser
home = expanduser("~")

from sklearn.neighbors import KernelDensity

#RASH
L = 166
msa_file = home + '/Documents/Protein_data/RASH/RASH_HUMAN2_833a6535-26d0-4c47-8463-7970dae27a32_evfold_result/alignment/RASH_HUMAN2_RASH_HUMAN2_jackhmmer_e-10_m30_complete_run.fa'
msa, n_aa = tools.convert_msa(L, msa_file)
print(len(msa), len(msa[0]), n_aa)



msa_vectors = []
for samp in range(2000):
    msa_vectors.append(
        np.ndarray.flatten(tools.convert_samp_to_one_hot(msa[samp], n_aa)))


msa_vectors = np.array(msa_vectors)
print(msa_vectors.shape)

for bw in [.01, .1, 1., 10.]:
    kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(msa_vectors[1000:])
    # density_train = kde.score_samples(msa_vectors)
    print(bw, kde.score(msa_vectors[:1000]))

Example #10
class DensityEstimator:
    def __init__(self,
                 training_set,
                 method_name,
                 n_components=None,
                 log_dir=None,
                 second_stage_beta=None):
        self.log_dir = log_dir
        self.training_set = training_set
        self.fitting_done = False
        self.method_name = method_name
        self.second_density_mdl = None
        self.skip_fitting_and_sampling = False
        if method_name == "GMM_Dirichlet":
            self.model = mixture.BayesianGaussianMixture(
                n_components=n_components,
                covariance_type='full',
                weight_concentration_prior=1.0 / n_components)
        elif method_name == "GMM":
            self.model = mixture.GaussianMixture(n_components=n_components,
                                                 covariance_type='full',
                                                 max_iter=2000,
                                                 verbose=2,
                                                 tol=1e-3)
        elif method_name == "GMM_1":
            self.model = mixture.GaussianMixture(n_components=1,
                                                 covariance_type='full',
                                                 max_iter=2000,
                                                 verbose=2,
                                                 tol=1e-3)
        elif method_name == "GMM_10":
            self.model = mixture.GaussianMixture(n_components=10,
                                                 covariance_type='full',
                                                 max_iter=2000,
                                                 verbose=2,
                                                 tol=1e-3)
        elif method_name == "GMM_20":
            self.model = mixture.GaussianMixture(n_components=20,
                                                 covariance_type='full',
                                                 max_iter=2000,
                                                 verbose=2,
                                                 tol=1e-3)
        elif method_name == "GMM_100":
            self.model = mixture.GaussianMixture(n_components=100,
                                                 covariance_type='full',
                                                 max_iter=2000,
                                                 verbose=2,
                                                 tol=1e-3)
        elif method_name == "GMM_200":
            self.model = mixture.GaussianMixture(n_components=200,
                                                 covariance_type='full',
                                                 max_iter=2000,
                                                 verbose=2,
                                                 tol=1e-3)

        elif method_name.find("aux_vae") >= 0:
            have_2nd_density_est = False
            if method_name[8:] != "":
                self.second_density_mdl = method_name[8:]
                have_2nd_density_est = True
            self.model = VaeModelWrapper(
                input_shape=(training_set.shape[-1], ),
                latent_space_dim=training_set.shape[-1],
                have_2nd_density_est=have_2nd_density_est,
                log_dir=self.log_dir,
                sec_stg_beta=second_stage_beta)

        elif method_name == "given_zs":
            files = os.listdir(log_dir)
            for z_smpls in files:
                if z_smpls.endswith('.npy'):
                    break
            self.z_smps = np.load(os.path.join(log_dir, z_smpls))
            self.skip_fitting_and_sampling = True

        elif method_name.upper() == "KDE":
            self.model = KernelDensity(kernel='gaussian', bandwidth=0.425)
            # self.model = KernelDensity(kernel='tophat', bandwidth=15)
        else:
            raise NotImplementedError("Method specified : " +
                                      str(method_name) +
                                      " doesn't have an implementation yet.")

    def fitorload(self, file_name=None):
        if not self.skip_fitting_and_sampling:
            if file_name is None:
                self.model.fit(self.training_set, self.second_density_mdl)
            else:
                self.model.load(file_name)

        self.fitting_done = True

    def score(self, X, y=None):
        if self.method_name.upper().find(
                "AUX_VAE") >= 0 or self.skip_fitting_and_sampling:
            raise NotImplementedError(
                "Log likelihood evaluation for VAE is difficult. or skipped")
        else:
            return self.model.score(X, y)

    def save(self, file_name):
        if not self.skip_fitting_and_sampling:
            if self.method_name.find('vae') >= 0:
                self.model.save(file_name)
            else:
                with open(file_name, 'wb') as f:
                    pickle.dump(self.model, f)

    def reconstruct(self, input_batch):
        if self.method_name.upper().find("AUX_VAE") < 0:
            raise ValueError("Non autoencoder style density estimator: " +
                             self.method_name)
        return self.model.reconstruct(input_batch)

    def get_samples(self, n_samples):
        if not self.skip_fitting_and_sampling:
            if not self.fitting_done:
                self.fitorload()
            scrmb_idx = np.array(range(n_samples))
            np.random.shuffle(scrmb_idx)
            if self.log_dir is not None:
                pickle_path = os.path.join(self.log_dir,
                                           self.method_name + '_mdl.pkl')
                with open(pickle_path, 'wb') as f:
                    pickle.dump(self.model, f)
            if (self.method_name.upper().startswith("GMM")
                    or self.method_name.upper().find("AUX_VAE") >= 0):
                # mixture models return a (samples, labels) tuple
                return self.model.sample(n_samples)[0][scrmb_idx, :]
            else:
                # KernelDensity.sample returns the array directly; indexing
                # with the shuffled scrmb_idx already randomizes the order
                # (np.random.shuffle returns None, so it cannot be chained)
                return self.model.sample(n_samples)[scrmb_idx, :]
        else:
            return self.z_smps
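
# A hypothetical driver for the class above (illustrative shapes; assumes
# numpy plus the sklearn/pickle imports the class already relies on):
import numpy as np

training_set = np.random.randn(5000, 16)
de = DensityEstimator(training_set, method_name="KDE")
de.fitorload()
print(de.score(np.random.randn(100, 16)))  # total log-likelihood under the KDE
samples = de.get_samples(64)               # 64 draws from the fitted density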
Example #11
#Splits the training set in train and validation sets
#X_train,X_val,Y_train,Y_val=train_test_split(X,Y,test_size=0.33, shuffle=True,stratify=Y)

#Defines the range of the bandwidth cross validation testing
bandwidth=np.linspace(0.01,1,30)

#Cross Validation with 10 folds
kf = StratifiedKFold(n_splits=10)
folds=10
sc=[]
Vbw=[]
scores=[]
for bw in bandwidth:
    #Needs Completion
    tr_err = va_err = 0
    for tr_ix,va_ix in kf.split(X,Y):
        #Study how to use this function
        #(KernelDensity.fit ignores a y argument; the fit is unsupervised)
        kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(X[tr_ix])
        sc.append(kde.score(X[va_ix]))
    scores.append(np.sum(sc)/len(sc))
    sc=[]
    Vbw.append(bw)

bestBW=Vbw[np.argmax(scores)]
print("Best score->" + str(np.max(scores)) + " with bandwidth= " + str(bestBW))
pClass1=np.log(np.sum(Y_test)/len(Y_test))
pClass0=np.log(1-np.sum(Y_test)/len(Y_test))
kde = KernelDensity(kernel='gaussian', bandwidth=bestBW).fit(X)
log_densities = kde.score_samples(X_test)
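
# A sketch of how the pieces above could combine into a KDE naive Bayes
# classifier (an assumed completion, not part of the original exercise;
# X, Y, X_test are taken to be numpy arrays):
kde0 = KernelDensity(kernel='gaussian', bandwidth=bestBW).fit(X[Y == 0])
kde1 = KernelDensity(kernel='gaussian', bandwidth=bestBW).fit(X[Y == 1])
log_p0 = pClass0 + kde0.score_samples(X_test)
log_p1 = pClass1 + kde1.score_samples(X_test)
predictions = (log_p1 > log_p0).astype(int)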
Example #12
def variable_score(variable, parents, data):
    score = 0
    if len(parents) == 0:
        #print(data)
        column = data[variable]
        #print(column)
        #kernel = kde.gaussian_kde(column.values)
        #
        #x = np.linspace(min(column.values), max(column.values), 1000)
        #print(kernel.covariance_factor())
        #plt.plot(x, np.log(kernel(x)))
        #plt.show()
        #sample = kernel.resample(5000)
        #kernel = kde.gaussian_kde(sample)
        #plt.plot(x, kernel(x))
        #plt.show()
        #start = time.time()
        #print(kernel.logpdf(column.values).sum())
        #print("scipy: ", time.time() - start)

        #grid = GridSearchCV(KernelDensity(), {'bandwidth': np.linspace(0.1,1.0,10)}, cv=10)
        #grid.fit(column.values[:, None])
        #print(grid.best_params_)

        vals = column.values[:, np.newaxis]

        #x = np.linspace(min(column.values), max(column.values), 1000)
        #kdens = KernelDensity(kernel='gaussian', bandwidth=1, rtol=0).fit(vals)
        #plt.plot(x, kdens.score_samples(x[:, np.newaxis]))
        #plt.show()

        start = time.time()
        kdens = KernelDensity(kernel='gaussian', bandwidth=0.2,
                              rtol=1E-2).fit(vals)
        svals = np.sort(vals, axis=0)[::-1]  # descending, keeps (n, 1) shape
        plt.plot(svals, kdens.score_samples(svals))
        plt.show()
        print(kdens.score(vals))
        print("sklearn: ", time.time() - start)

        #array = np.unique(data[variable].values)
        #plt.scatter(array, [0] * len(array))
        #plt.plot(np.linspace(min(array), max(array), 1000), kernel(np.linspace(min(array), max(array), 1000)) )
        #plt.show()

        #start = time.time()
        #print(column.apply(event_score, args=(kernel,)).sum())
        #print("apply: ", time.time() - start)

        #start = time.time()
        #density = sm.nonparametric.KDEMultivariate(data=[column], var_type='c')
        #print(len(column.values), len(np.unique(column.values)))
        #print(np.log(density.pdf(column.values)).sum())
        #print("statsmodels: ", time.time() -  start)
    else:
        cols = parents + [variable]
        d = data[cols]
        #print(d)
        #print(d.values)
        samp = KernelDensity(kernel='gaussian', bandwidth=0.2,
                             rtol=1E-8).fit(d.values).sample(5000)
        score1 = KernelDensity(kernel='gaussian', bandwidth=0.2,
                               rtol=1E-8).fit(samp).score(d.values)
        samp = KernelDensity(kernel='gaussian', bandwidth=0.2,
                             rtol=1E-8).fit(data[parents].values).sample(5000)
        score2 = KernelDensity(kernel='gaussian', bandwidth=0.2,
                               rtol=1E-8).fit(samp).score(data[parents].values)
        print(variable, parents, score1, score2, score1 - score2)
        return score1 - score2
        #print(KernelDensity(bandwidth=0.2).fit([np.linspace(-5,5, 100)]).score_samples([np.linspace(-5,5, 100)]))
        #plt.plot(np.linspace(-5, 5, 100), KernelDensity(bandwidth=0.2).fit([np.linspace(-5,5, 100)]).score_samples([np.linspace(-5,5, 100)]))
        #plt.show()
    return score
Example #13
def metric_standardize(metric_arr):
    return (metric_arr - metric_means) / metric_stds


# def delta_standardize(metric_arr):
# 	return (metric_arr - delta_means) / delta_stds

# def combined_standardize(combined_arr):
# 	return (combined_arr - combined_means) / combined_stds

kde_dict = {}
for task in training_data.keys():
    task_str = str(np.array(eval(task)))
    task_data = training_data[str(task)]
    task_data = metric_standardize(task_data)
    task_kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(task_data)
    # bind kde=task_kde now; a bare closure would late-bind to the loop's
    # final iteration
    task_obs_fun = lambda x, kde=task_kde: np.exp(kde.score(x))
    kde_dict[task_str] = task_kde  #task_obs_fun


def transition_fn(initial_state, resulting_state, action):
    if np.array_equal(initial_state, resulting_state):
        return 1 - 0.5
    else:
        return 0.5 / 9


# def deltas_transition_fn(initial_state, resulting_state, action):
# 	means, cov = deltas_emissions_probs[str(list(resulting_state))]
# 	return multivariate_normal.pdf(action, mean=means, cov=cov)

# def observation_fn(observation, task, dx = 0.01):
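
# A hypothetical observation lookup against kde_dict (the task key and the
# metric vector are illustrative; score() expects a 2-D row per observation):
import numpy as np

some_task = next(iter(kde_dict))
obs = metric_standardize(np.zeros((1, 3)))  # assumes 3 metric columns
print(np.exp(kde_dict[some_task].score(obs)))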
Example #14
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
sys.path.insert(0, '../tools')
import protein_model_tools as tools

from os.path import expanduser
home = expanduser("~")

from sklearn.neighbors import KernelDensity

#RASH
L = 166
msa_file = home + '/Documents/Protein_data/RASH/RASH_HUMAN2_833a6535-26d0-4c47-8463-7970dae27a32_evfold_result/alignment/RASH_HUMAN2_RASH_HUMAN2_jackhmmer_e-10_m30_complete_run.fa'
msa, n_aa = tools.convert_msa(L, msa_file)
print(len(msa), len(msa[0]), n_aa)

msa_vectors = []
for samp in range(2000):
    msa_vectors.append(
        np.ndarray.flatten(tools.convert_samp_to_one_hot(msa[samp], n_aa)))

msa_vectors = np.array(msa_vectors)
print(msa_vectors.shape)

for bw in [.01, .1, 1., 10.]:
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=bw).fit(msa_vectors[1000:])
    # density_train = kde.score_samples(msa_vectors)
    print(bw, kde.score(msa_vectors[:1000]))
Example #15
print('Re-shaped layer_output_from_test', layer_output_from_test.shape)
neuron_number = layer_output_from_train.size // layer_output_from_train.shape[0]
layer_output_from_train = np.reshape(
    layer_output_from_train,
    (layer_output_from_train.shape[0], int(neuron_number)))
print('Re-shaped layer_output_from_train', layer_output_from_train.shape)

## KDE
#reference https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity.score
#reference https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html#scipy.stats.gaussian_kde
scotts_factor = layer_output_from_train[0:number_samples - 1].size**(
    -1. / (layer_output_from_train.ndim + 4))
print('Bandwidth scotts_factor:', scotts_factor)
kde = KernelDensity(kernel='gaussian', bandwidth=scotts_factor).fit(
    layer_output_from_train[0:number_samples - 1])
kde_score = kde.score(
    layer_output_from_train[number_samples - 1:]) / neuron_number
print('KDE score:', kde_score)
#kde_sample=kde.score_samples(layer_output_from_train)
#print('KDE score samples:', kde_sample)
#kde.score_samples(layer_output_from_train)

#kernel = stats.gaussian_kde(layer_output_from_train[0:number_samples-1])
#print('kernel:',kernel.evaluate(layer_output_from_train[0:number_samples-1]))

## LSA
LSA = -kde_score
#LSA=-kde_sample
print('LSA_sklearn:', LSA)
#print('LSA_scipy:', stats.gaussian_kde(layer_output_from_train))
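
# For reference, Scott's rule as scipy's gaussian_kde defines it is
# n_samples ** (-1 / (d + 4)); the factor above plugs in the array's total
# size and ndim instead. A sketch of the scipy-style value (assumed
# training matrix of shape (n_samples, d)):
n, d = layer_output_from_train[0:number_samples - 1].shape
scotts_factor_scipy = n ** (-1.0 / (d + 4))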

Example #16
# The [`score_samples(X)`](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity.score_samples) method can be used to evaluate the density on sample data (i.e., the likelihood of each observation).



kde.score_samples(draws.T)


# For instance, based on the means $[0.45, 0.5, 0.55]$, the sample $[10, -10, 0]$ should be *very* unlikely, while $[0.4, 0.5, 0.6]$ will be *more* likely.
# And the vector of empirical means is a very likely observation as well.



kde.score(np.array([10, -10, 0]).reshape(1, -1))
kde.score(np.array([0.4, 0.5, 0.6]).reshape(1, -1))
kde.score(empirical_means.reshape(1, -1))


# ----
# ## Using the prediction to decide the next arm to sample
# 
# Now that we have a model of [Kernel Density](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html) estimation, we can use it to *generate some random samples*.



help(kde.sample)  # the notebook used the IPython magic `kde.sample?` here

Example #17
print(msa_vectors.shape)

#PCA
pca = PCA(n_components=20)
pca.fit(msa_vectors[1000:])
a_samps_pca = pca.transform(msa_vectors[1000:])
b_samps_pca = pca.transform(msa_vectors[:1000])
print(a_samps_pca.shape)

#KDE
# for bw in [.01, .1, 1., 10.]:
for bw in [1.]:
    kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(a_samps_pca)
    # density_train = kde.score_samples(msa_vectors)
    print(bw, kde.score(b_samps_pca))

densities = kde.score_samples(b_samps_pca)
# densities = np.ones(1000)

#Scale densities to betw 0 and 1
min_density = np.min(densities)
densities = densities - min_density + 1.

weights = np.reciprocal(densities)

max_weights = np.max(weights)
weights = weights / max_weights

print(np.max(weights))
print(np.mean(weights))
Example #18
def check_if_events_in_cluster_scored(data_arr_scores_list, events,
                                      event_time, event_type='po', corrected_likelihood_limit=0,
                                      probability=False, halflife=None
                                      ):
    #pylint: disable=redefined-outer-name
    '''check if a list of events are in the 4D cluster.'''
    output = {'event_number': [], 'run_number': [], 'in_veto_volume': [], }
    if events.empty:
        return (0, output, [])  # match the 3-tuple shape of the other returns
    data_arr_scores = np.concatenate(data_arr_scores_list)
    if not corrected_likelihood_limit:
        if event_type == 'po':
            corrected_likelihood_limit = corrected_likelihood_limit_po
        elif event_type == 'bipo':
            corrected_likelihood_limit = corrected_likelihood_limit_bipo
    data_arr_selected = data_arr_scores[data_arr_scores['score'] >
                                        corrected_likelihood_limit]
    if len(data_arr_selected) == 0:
        return (len(data_arr_selected), output, [])
    db = DBSCAN(eps=DBSCAN_radius,
                min_samples=DBSCAN_samples)\
                .fit(pd.DataFrame(data_arr_selected).values[:, :4])
    data_arr_cluster = np.zeros(data_arr_selected.shape,
                                dtype=[('x', np.double),
                                       ('y', np.double),
                                       ('z', np.double),
                                       ('t', np.double),
                                       ('score', np.double),
                                       ('label', int)])
    data_arr_cluster['x'] = data_arr_selected['x']
    data_arr_cluster['y'] = data_arr_selected['y']
    data_arr_cluster['z'] = data_arr_selected['z']
    data_arr_cluster['t'] = data_arr_selected['t']
    data_arr_cluster['score'] = data_arr_selected['score']
    data_arr_cluster['label'] = db.labels_
    data_arr_df = pd.DataFrame(data_arr_cluster)
    data_wo_outliers = data_arr_df.query('label != -1').values[:, :4]
    if len(data_wo_outliers) == 0:
        return (len(data_arr_selected), output, [])
    selected_fit = KernelDensity(kernel='tophat', rtol=kde_rtol,
                                 bandwidth=kernel_radius).fit(data_wo_outliers)
    for row in events.iterrows():
        t = abs(row[1].event_time - event_time)/(2*timestep)
        score = selected_fit.score([[row[1].x_3d_nn,
                                     row[1].y_3d_nn,
                                     row[1].z_3d_nn,
                                     t]])
        output['event_number'].append(row[1].event_number)
        output['run_number'].append(row[1].run_number)
        output['in_veto_volume'].append(score != -np.inf)
    if probability:
        times = np.unique(data_wo_outliers[:,3])
        times.sort()
        p_time_list = []
        if len(times):
            for time in times:
                decay_time = halflife/np.log(2)
                real_time = time*2*timestep
                time_right = real_time + timestep/2
                time_left = max(0, real_time - timestep/2)
                p = (stats.expon.cdf(time_right, scale=decay_time) -
                     stats.expon.cdf(time_left, scale=decay_time))
                p_time_list.append(p*len(data_arr_df.query('t == @time and label != -1'))/pointcloud_size)
        # import pdb; pdb.set_trace()
        return (len(data_arr_selected), output, p_time_list)

    return (len(data_arr_selected), output, [])
Example #19
def find_cluster_attraction(cluster: List, vec: np.ndarray) -> float:
    kernel = KernelDensity(kernel="gaussian").fit(cluster)
    # score() returns a total log-likelihood, not a probability in [0, 1]
    prob = kernel.score(np.asarray(vec).reshape(1, -1))
    return prob
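
# Hypothetical usage (illustrative 2-D vectors; assumes numpy, typing.List
# and the KernelDensity import):
import numpy as np

cluster = [[0.0, 0.1], [0.2, -0.1], [0.1, 0.0]]
print(find_cluster_attraction(cluster, np.array([0.05, 0.0])))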
Example #20

# instantiate KDE classes for the object (foreground) and the background
kde_fg = KernelDensity(kernel='gaussian', bandwidth=1, algorithm='kd_tree', leaf_size=100).fit(points_fg)
kde_bg = KernelDensity(kernel='gaussian', bandwidth=1, algorithm='kd_tree', leaf_size=100).fit(points_bg)

# initialize and compute the masks
score_kde_fg = np.zeros(img_input.shape[:2])  # fresh zero-filled matrices
score_kde_bg = np.zeros(img_input.shape[:2])
likelihood_fg = np.zeros(img_input.shape[:2])
coordinates = it.product(range(score_kde_fg.shape[0]), range(score_kde_fg.shape[1]))

for x, y in tqdm_notebook(coordinates, total=np.prod(score_kde_fg.shape)):
    score_kde_fg[x, y] = np.exp(kde_fg.score(img_input[x, y, :].reshape(1, -1)))
    score_kde_bg[x, y] = np.exp(kde_bg.score(img_input[x, y, :].reshape(1, -1)))
    n = score_kde_fg[x, y] + score_kde_bg[x, y]
    if n == 0:
        n = 1
    likelihood_fg[x, y] = score_kde_fg[x, y]/n

print('Finish!')
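
# A vectorized alternative to the per-pixel loop above (a sketch; assumes
# img_input has shape (H, W, C)). score_samples evaluates every pixel in
# one call, which is much faster than calling score() once per pixel:
pixels = img_input.reshape(-1, img_input.shape[2])
score_fg = np.exp(kde_fg.score_samples(pixels)).reshape(img_input.shape[:2])
score_bg = np.exp(kde_bg.score_samples(pixels)).reshape(img_input.shape[:2])
denom = score_fg + score_bg
denom[denom == 0] = 1
likelihood_fg_vec = score_fg / denom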

# run the algorithm for the two masks
d_fg = dijkstra(xy_fg, likelihood_fg)
d_bg = dijkstra(xy_bg, 1 - likelihood_fg)

print('Finish 2 !')

margin = 1.0
Example #21
File: p4.py  Project: ankitagarwal/magic
from sklearn.mixture import GaussianMixture
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

# Build a model to draw samples from
data = np.random.rand(3000, 2)
gmm = GaussianMixture(n_components=3)
gmm.fit(data)
sample, _ = gmm.sample(1000)  # GaussianMixture.sample returns (X, labels)

# Get best BW
grid = GridSearchCV(KernelDensity(),
                    {'bandwidth': np.linspace(0.001, 1.0, 30)},
                    cv=20) # 20-fold cross-validation
grid.fit(sample)
print(grid.best_params_)

# Fit KDE
kde = KernelDensity(kernel='gaussian', bandwidth=0.0699).fit(sample)
print(kde.score(sample))

# GMM fit https://github.com/scikit-learn/scikit-learn/issues/7295
components = [2, 3, 5, 10]
for value in components:
    g = GaussianMixture(n_components=value, random_state=43)
    g.fit(sample)
    print("log likelihood for components = " + str(value) + " is " + str(g.score(sample)))