class LeaveOneOutEntropyEstimator(ItEstimator):
    """
    Leave-one-out cross-validation entropy estimation from datapoints,
    using kernel density estimation of the probability density.

    See: Ivanov A. V. and Rozhkova, "Properties of the statistical
    estimate of the entropy of a random vector with a probability density".
    """

    def __init__(self, kernel, min_log_proba, bandwidth=1.0):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
        self.min_log_proba = min_log_proba

    def estimateFromData(self, datapoints):
        entropy = 0.0
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)

        # Leave each point out in turn: fit the KDE on the rest and
        # evaluate the log-density at the held-out point.
        for i in range(datapoints.shape[0]):
            curr = np.delete(datapoints, i, axis=0)
            self.kde.fit(curr)
            score = self.kde.score(datapoints[None, i, :])
            if score < self.min_log_proba:
                # Skip numerically degenerate points.
                continue
            entropy -= score

        return entropy / datapoints.shape[0]

    def entropy(self, X):
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
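# A minimal usage sketch for the estimator above, assuming the project's
# ItEstimator base class is importable; the 2-D Gaussian data is made up
# for illustration:
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
est = LeaveOneOutEntropyEstimator(kernel="gaussian", min_log_proba=-500)
print(est.entropy(X))  # leave-one-out entropy estimate in nats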
def test2(data, col):
    # KernelDensity expects a 2-D array of shape (n_samples, n_features).
    vals = data[col].values[:, np.newaxis]
    kdens = KernelDensity(kernel='gaussian', bandwidth=0.5, rtol=1E-2).fit(vals)
    s = kdens.score(vals)  # total log-likelihood of the data under the KDE
    print("Score:", s)
    return s
def estimate_distribution(samples, h=0.1, n_points=100):
    kde = KernelDensity(bandwidth=h)
    samples = samples[:, np.newaxis]
    kde.fit(samples)
    xs = np.linspace(-1.0, 1.0, n_points)
    # score() expects a 2-D array, so each point is wrapped as [[x]].
    ys = [np.exp(kde.score([[x]])) for x in xs]
    return xs, ys
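# A small usage sketch for estimate_distribution; imports are repeated so
# the example is self-contained, and the uniform sample is made up:
import numpy as np
from sklearn.neighbors import KernelDensity

samples = np.random.default_rng(0).uniform(-1, 1, 500)
xs, ys = estimate_distribution(samples, h=0.1, n_points=100)
print(xs[50], ys[50])  # density estimate near x = 0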
def check_if_events_in_cluster(points, events, event_time,
                               n_selection=n_selection_po,
                               multiprocess=True,
                               event_type='po'):  # pylint: disable=redefined-outer-name
    '''Check whether a list of events lies inside the 4D cluster.'''
    output = {'event_number': [],
              'run_number': [],
              'in_veto_volume': []}
    data_arr_nowall = remove_wall_points_np(data_arr_from_points(points))
    if not data_arr_nowall.shape[0]:
        warnings.warn('No points left in cluster after removing wall points',
                      RuntimeWarning)
        for row in events.iterrows():
            output['event_number'].append(row[1].event_number)
            output['run_number'].append(row[1].run_number)
            output['in_veto_volume'].append(False)
        return output
    if events.empty:
        return output
    data_arr_scores = kde_likelihood(data_arr_nowall,
                                     multiprocess=multiprocess,
                                     event_type=event_type)
    # Keep the last 1/n_selection of the points (the highest-likelihood
    # points, assuming kde_likelihood returns scores sorted ascending).
    data_arr_selected = data_arr_scores[-len(data_arr_scores)//n_selection:]
    db = DBSCAN(eps=DBSCAN_radius, min_samples=DBSCAN_samples)\
        .fit(pd.DataFrame(data_arr_selected).values[:, :4])
    data_arr_cluster = np.zeros(data_arr_selected.shape,
                                dtype=[('x', np.double), ('y', np.double),
                                       ('z', np.double), ('t', np.double),
                                       ('score', np.double), ('label', int)])
    data_arr_cluster['x'] = data_arr_selected['x']
    data_arr_cluster['y'] = data_arr_selected['y']
    data_arr_cluster['z'] = data_arr_selected['z']
    data_arr_cluster['t'] = data_arr_selected['t']
    data_arr_cluster['score'] = data_arr_selected['score']
    data_arr_cluster['label'] = db.labels_
    data_arr_df = pd.DataFrame(data_arr_cluster)
    # Drop DBSCAN outliers (label == -1) before the final density fit.
    data_wo_outliers = data_arr_df.query('label != -1').values[:, :4]
    selected_fit = KernelDensity(kernel='tophat', rtol=kde_rtol,
                                 bandwidth=kernel_radius).fit(data_wo_outliers)
    for row in events.iterrows():
        t = abs(row[1].event_time - event_time) / (2 * timestep)
        score = selected_fit.score([[row[1].x_3d_nn, row[1].y_3d_nn,
                                     row[1].z_3d_nn, t]])
        output['event_number'].append(row[1].event_number)
        output['run_number'].append(row[1].run_number)
        # With a tophat kernel, score == -inf means the event lies outside
        # every kernel's support, i.e. outside the cluster.
        output['in_veto_volume'].append(score != -np.inf)
    return output
def calc_score(data, cols):
    vals = data[list(cols)].values
    # Find the best bandwidth for the KDE via grid search.
    params = {'bandwidth': np.logspace(-2, 5, 20)}
    grid = GridSearchCV(KernelDensity(kernel='gaussian', rtol=1E-6),
                        params, cv=2)
    grid.fit(vals)
    # Refit a KDE with the selected bandwidth and score the data.
    kdens = KernelDensity(kernel='gaussian',
                          bandwidth=grid.best_estimator_.bandwidth,
                          rtol=1E-6).fit(vals)
    return kdens.score(vals)
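# Note: with GridSearchCV's default refit=True, grid.best_estimator_ is
# already a KernelDensity refit on all of vals, so an equivalent, shorter
# variant of calc_score (hypothetical name) is:
def calc_score_short(data, cols):
    vals = data[list(cols)].values
    grid = GridSearchCV(KernelDensity(kernel='gaussian', rtol=1E-6),
                        {'bandwidth': np.logspace(-2, 5, 20)}, cv=2)
    grid.fit(vals)
    return grid.best_estimator_.score(vals)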
class KDEntropyEstimator(ItEstimator):
    discrete = False

    def __init__(self, kernel="gaussian", min_log_proba=-500,
                 bandwidth=1.0, kfold=10):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
        self.min_log_proba = min_log_proba
        self.kfold = kfold

    def estimateFromData(self, datapoints):
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)

        entropy = 0.0
        n, d = datapoints.shape
        ma = np.ones(n, dtype=bool)  # np.bool is deprecated
        unit = n // self.kfold
        rem = n % self.kfold
        start = 0
        end = unit + rem
        # k-fold cross-validated entropy: fit on k-1 folds, then score the
        # held-out fold.
        for i in range(self.kfold):
            sel = np.arange(start, end)
            ma[start:end] = False
            curr = datapoints[ma, :]
            self.kde.fit(curr)
            score = self.kde.score(datapoints[sel, :])
            ma[:] = True
            start = end
            end = min(unit + end, n)
            if score < self.min_log_proba:
                continue
            entropy -= score

        return entropy / n

    def entropy(self, X):
        np.random.seed(0)
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
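# Minimal usage sketch for the k-fold estimator above (the ItEstimator base
# class is assumed available, as in the leave-one-out variant):
import numpy as np

X = np.random.default_rng(1).normal(size=(500, 2))
est = KDEntropyEstimator(kernel="gaussian", bandwidth=0.5, kfold=10)
print(est.entropy(X))  # cross-validated entropy estimate in nats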
def test(self, member_id, potential_events, info_repo, simscores):
    ## input : member_id, list of potential events
    ## output : probability-density-estimate (KDE) scores written into simscores
    events_info = info_repo["events_info"]
    member_events = np.array(self.training_vecs[member_id])
    # Found no past history for this user: no data to fit a distribution on.
    if len(member_events) == 0:
        return
    kde = KernelDensity(kernel='gaussian').fit(member_events)
    similarity_scores = []
    for event_id in potential_events:
        lat = events_info[event_id]["lat"]
        lon = events_info[event_id]["lon"]
        # Collect the previous score, then overwrite it with the likelihood
        # of the event location under the member's KDE.
        similarity_scores.append(simscores[member_id][event_id])
        simscores[member_id][event_id] = np.exp(
            kde.score(np.array([[lat, lon]])))
def test(models, device):
    test_dataset = datasets.MNIST(config.dataset_dir, train=False,
                                  transform=transforms.ToTensor())
    test_loader = DataLoader(test_dataset, batch_size=config.batch_size,
                             num_workers=config.num_workers)
    X_data = np.zeros((10000, 784), dtype=np.float32)
    X_generated = np.zeros((10000, 784), dtype=np.float32)
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            # Generator input: uniform noise in [-1, 1).
            noise = torch.rand((data.size(0), config.noise_features),
                               device=device) * 2 - 1
            generated = models.gen(noise)
            start = i * config.batch_size
            end = min((i + 1) * config.batch_size, 10000)
            X_data[start:end] = data.view(-1, 784).numpy()
            X_generated[start:end] = generated.cpu().numpy()
    print("Calculating the score...")
    # Parzen-window evaluation: fit a KDE to the generated samples and report
    # the average log-likelihood of the real test data under it.
    kde = KernelDensity(bandwidth=0.2).fit(X_generated)
    print("Score: {:.4f}".format(kde.score(X_data) / 10000))
class DensityEstimator:
    def __init__(self, training_set, method_name, n_components=None,
                 log_dir=None, second_stage_beta=None):
        self.log_dir = log_dir
        self.training_set = training_set
        self.fitting_done = False
        self.method_name = method_name
        self.second_density_mdl = None
        self.skip_fitting_and_sampling = False
        if method_name == "GMM_Dirichlet":
            self.model = mixture.BayesianGaussianMixture(
                n_components=n_components, covariance_type='full',
                weight_concentration_prior=1.0 / n_components)
        elif method_name == "GMM":
            self.model = mixture.GaussianMixture(
                n_components=n_components, covariance_type='full',
                max_iter=2000, verbose=2, tol=1e-3)
        elif method_name in ("GMM_1", "GMM_10", "GMM_20", "GMM_100", "GMM_200"):
            # Fixed-size mixtures: the suffix encodes the component count.
            self.model = mixture.GaussianMixture(
                n_components=int(method_name.split('_')[1]),
                covariance_type='full', max_iter=2000, verbose=2, tol=1e-3)
        elif method_name.find("aux_vae") >= 0:
            have_2nd_density_est = False
            if method_name[8:] != "":
                self.second_density_mdl = method_name[8:]
                have_2nd_density_est = True
            self.model = VaeModelWrapper(
                input_shape=(training_set.shape[-1],),
                latent_space_dim=training_set.shape[-1],
                have_2nd_density_est=have_2nd_density_est,
                log_dir=self.log_dir,
                sec_stg_beta=second_stage_beta)
        elif method_name == "given_zs":
            # Load pre-computed latent samples instead of fitting a model.
            files = os.listdir(log_dir)
            for z_smpls in files:
                if z_smpls.endswith('.npy'):
                    break
            self.z_smps = np.load(os.path.join(log_dir, z_smpls))
            self.skip_fitting_and_sampling = True
        elif method_name.upper() == "KDE":
            self.model = KernelDensity(kernel='gaussian', bandwidth=0.425)
            # self.model = KernelDensity(kernel='tophat', bandwidth=15)
        else:
            raise NotImplementedError("Method specified: " + str(method_name) +
                                      " doesn't have an implementation yet.")

    def fitorload(self, file_name=None):
        if not self.skip_fitting_and_sampling:
            if file_name is None:
                self.model.fit(self.training_set, self.second_density_mdl)
            else:
                self.model.load(file_name)
        self.fitting_done = True

    def score(self, X, y=None):
        if self.method_name.upper().find("AUX_VAE") >= 0 \
                or self.skip_fitting_and_sampling:
            raise NotImplementedError(
                "Log-likelihood evaluation for VAE is difficult or skipped.")
        return self.model.score(X, y)

    def save(self, file_name):
        if not self.skip_fitting_and_sampling:
            if self.method_name.find('vae') >= 0:
                self.model.save(file_name)
            else:
                with open(file_name, 'wb') as f:
                    pickle.dump(self.model, f)

    def reconstruct(self, input_batch):
        if self.method_name.upper().find("AUX_VAE") < 0:
            raise ValueError("Non autoencoder style density estimator: " +
                             self.method_name)
        return self.model.reconstruct(input_batch)

    def get_samples(self, n_samples):
        if self.skip_fitting_and_sampling:
            return self.z_smps
        if not self.fitting_done:
            self.fitorload()
        scrmb_idx = np.arange(n_samples)
        np.random.shuffle(scrmb_idx)
        if self.log_dir is not None:
            pickle_path = os.path.join(self.log_dir,
                                       self.method_name + '_mdl.pkl')
            with open(pickle_path, 'wb') as f:
                pickle.dump(self.model, f)
        if self.method_name.upper().startswith("GMM") \
                or self.method_name.upper().find("AUX_VAE") >= 0:
            # Mixture / VAE samplers return a (samples, labels) pair.
            return self.model.sample(n_samples)[0][scrmb_idx, :]
        # KernelDensity.sample returns only the samples. np.random.shuffle
        # shuffles in place and returns None, so shuffle via indexing instead.
        return self.model.sample(n_samples)[scrmb_idx, :]
#Splits the training set in train and validation sets
#X_train,X_val,Y_train,Y_val=train_test_split(X,Y,test_size=0.33, shuffle=True,stratify=Y)

#Defines the range of bandwidths for cross-validation testing
bandwidth = np.linspace(0.01, 1, 30)

#Cross-validation with 10 folds
kf = StratifiedKFold(n_splits=10)
sc = []
Vbw = []
scores = []
for bw in bandwidth:
    for tr_ix, va_ix in kf.split(X, Y):
        # KernelDensity ignores labels: fit on the training fold and score
        # the total log-likelihood of the validation fold.
        kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(X[tr_ix])
        sc.append(kde.score(X[va_ix]))
    scores.append(np.sum(sc) / len(sc))
    sc = []
    Vbw.append(bw)

bestBW = Vbw[np.argmax(scores)]
print("Best score -> " + str(np.max(scores)) + " with bandwidth = " + str(bestBW))

#Class log-priors (note: estimated here from the test labels; they would
#normally come from the training labels)
pClass1 = np.log(np.sum(Y_test) / len(Y_test))
pClass0 = np.log(1 - np.sum(Y_test) / len(Y_test))
kde = KernelDensity(kernel='gaussian', bandwidth=bestBW).fit(X)
log_dens = kde.score_samples(X_test)  # renamed to avoid shadowing built-in eval
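# A hedged sketch of how the pieces above would combine into a naive Bayes
# classifier with KDE class-conditionals (X, Y, X_test, bestBW, pClass0 and
# pClass1 are assumed from the surrounding script):
kde0 = KernelDensity(kernel='gaussian', bandwidth=bestBW).fit(X[Y == 0])
kde1 = KernelDensity(kernel='gaussian', bandwidth=bestBW).fit(X[Y == 1])
log_post0 = pClass0 + kde0.score_samples(X_test)
log_post1 = pClass1 + kde1.score_samples(X_test)
predictions = (log_post1 > log_post0).astype(int)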
def variable_score(variable, parents, data):
    if len(parents) == 0:
        # Marginal log-likelihood of the variable under a KDE.
        # (Alternatives explored: scipy.stats.gaussian_kde, statsmodels
        # KDEMultivariate, and a GridSearchCV bandwidth search.)
        vals = data[variable].values[:, np.newaxis]
        start = time.time()
        kdens = KernelDensity(kernel='gaussian', bandwidth=0.2,
                              rtol=1E-2).fit(vals)
        ordered = np.sort(vals, axis=0)[::-1]
        plt.plot(ordered, kdens.score_samples(ordered))
        plt.show()
        score = kdens.score(vals)
        print(score)
        print("sklearn: ", time.time() - start)
        return score
    else:
        # Conditional score: log p(variable, parents) - log p(parents),
        # each estimated by scoring a KDE refit on samples drawn from the
        # original fit.
        cols = parents + [variable]
        d = data[cols]
        samp = KernelDensity(kernel='gaussian', bandwidth=0.2,
                             rtol=1E-8).fit(d.values).sample(5000)
        score1 = KernelDensity(kernel='gaussian', bandwidth=0.2,
                               rtol=1E-8).fit(samp).score(d.values)
        samp = KernelDensity(kernel='gaussian', bandwidth=0.2,
                             rtol=1E-8).fit(data[parents].values).sample(5000)
        score2 = KernelDensity(kernel='gaussian', bandwidth=0.2,
                               rtol=1E-8).fit(samp).score(data[parents].values)
        print(variable, parents, score1, score2, score1 - score2)
        return score1 - score2
    return (metric_arr - metric_means) / metric_stds

# def delta_standardize(metric_arr):
#     return (metric_arr - delta_means) / delta_stds

# def combined_standardize(combined_arr):
#     return (combined_arr - combined_means) / combined_stds

kde_dict = {}
for task in training_data.keys():
    task_str = str(np.array(eval(task)))
    task_data = metric_standardize(training_data[str(task)])
    task_kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(task_data)
    # Bind task_kde at definition time: a bare closure would capture the
    # loop variable and every lambda would end up using the last KDE.
    task_obs_fun = lambda x, kde=task_kde: np.exp(kde.score(x))
    kde_dict[task_str] = task_kde  # or task_obs_fun

def transition_fn(initial_state, resulting_state, action):
    # Self-transition with probability 0.5; spread the rest over 9 neighbours.
    if np.array_equal(initial_state, resulting_state):
        return 1 - 0.5
    return 0.5 / 9

# def deltas_transition_fn(initial_state, resulting_state, action):
#     means, cov = deltas_emissions_probs[str(list(resulting_state))]
#     return multivariate_normal.pdf(action, mean=means, cov=cov)

# def observation_fn(observation, task, dx=0.01):
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
sys.path.insert(0, '../tools')
import protein_model_tools as tools
from os.path import expanduser
home = expanduser("~")
from sklearn.neighbors import KernelDensity  # sklearn.neighbors.kde is deprecated

# RASH
L = 166
msa_file = home + '/Documents/Protein_data/RASH/RASH_HUMAN2_833a6535-26d0-4c47-8463-7970dae27a32_evfold_result/alignment/RASH_HUMAN2_RASH_HUMAN2_jackhmmer_e-10_m30_complete_run.fa'
msa, n_aa = tools.convert_msa(L, msa_file)
print(len(msa), len(msa[0]), n_aa)

# One-hot encode the first 2000 MSA samples and flatten each to a vector.
msa_vectors = []
for samp in range(2000):
    msa_vectors.append(
        np.ndarray.flatten(tools.convert_samp_to_one_hot(msa[samp], n_aa)))
msa_vectors = np.array(msa_vectors)
print(msa_vectors.shape)

# Sweep bandwidths: fit on the last 1000 samples, score the first 1000.
for bw in [.01, .1, 1., 10.]:
    kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(msa_vectors[1000:])
    print(bw, kde.score(msa_vectors[:1000]))
print('Re-shaped layer_output_from_test', layer_output_from_test.shape)
neuron_number = layer_output_from_train.size // layer_output_from_train.shape[0]
layer_output_from_train = np.reshape(
    layer_output_from_train,
    (layer_output_from_train.shape[0], int(neuron_number)))
print('Re-shaped layer_output_from_train', layer_output_from_train.shape)

## KDE
# reference https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity.score
# reference https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html#scipy.stats.gaussian_kde
# Scott's rule (as in scipy.stats.gaussian_kde): n ** (-1 / (d + 4)) with
# n training samples in d dimensions.
n_train = layer_output_from_train[0:number_samples - 1].shape[0]
scotts_factor = n_train ** (-1. / (neuron_number + 4))
print('Bandwidth scotts_factor:', scotts_factor)
kde = KernelDensity(kernel='gaussian', bandwidth=scotts_factor).fit(
    layer_output_from_train[0:number_samples - 1])
kde_score = kde.score(
    layer_output_from_train[number_samples - 1:]) / neuron_number
print('KDE score:', kde_score)

## LSA (likelihood-based surprise adequacy): negative average log-density.
LSA = -kde_score
print('LSA_sklearn:', LSA)
# The [`score_samples(X)`](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity.score_samples) method evaluates the density on sample data (i.e., the log-likelihood of each observation).

# In[23]:

kde.score_samples(draws.T)

# For instance, based on the means $[0.45, 0.5, 0.55]$, the sample $[10, -10, 0]$ should be *very* unlikely, while $[0.4, 0.5, 0.6]$ will be *more* likely.
# The vector of empirical means is a very likely observation as well.

# In[24]:

print(kde.score(np.array([10, -10, 0]).reshape(1, -1)))
print(kde.score(np.array([0.4, 0.5, 0.6]).reshape(1, -1)))
print(kde.score(empirical_means.reshape(1, -1)))

# ----
# ## Using the prediction to decide the next arm to sample
#
# Now that we have a model of [Kernel Density](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html) estimation, we can use it to *generate some random samples*.

# In[25]:

get_ipython().run_line_magic('pinfo', 'kde.sample')
print(msa_vectors.shape)

# PCA: fit on the last 1000 samples, project both splits to 20 components.
pca = PCA(n_components=20)
pca.fit(msa_vectors[1000:])
a_samps_pca = pca.transform(msa_vectors[1000:])
b_samps_pca = pca.transform(msa_vectors[:1000])
print(a_samps_pca.shape)

# KDE
# for bw in [.01, .1, 1., 10.]:
for bw in [1.]:
    kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(a_samps_pca)
    print(bw, kde.score(b_samps_pca))

densities = kde.score_samples(b_samps_pca)

# Shift log-densities to be >= 1, then weight samples by the reciprocal so
# low-density samples receive the largest (normalised) weights.
min_density = np.min(densities)
densities = densities - min_density + 1.
weights = np.reciprocal(densities)
max_weights = np.max(weights)
weights = weights / max_weights
print(np.max(weights))
print(np.mean(weights))
def check_if_events_in_cluster_scored(data_arr_scores_list, events, event_time,
                                      event_type='po',
                                      corrected_likelihood_limit=0,
                                      probability=False,
                                      halflife=None):  # pylint: disable=redefined-outer-name
    '''Check whether a list of events lies inside the 4D cluster.'''
    output = {'event_number': [],
              'run_number': [],
              'in_veto_volume': []}
    if events.empty:
        # Match the (n_selected, output, p_time_list) shape of the other returns.
        return (0, output, [])
    data_arr_scores = np.concatenate(data_arr_scores_list)
    if not corrected_likelihood_limit:
        if event_type == 'po':
            corrected_likelihood_limit = corrected_likelihood_limit_po
        elif event_type == 'bipo':
            corrected_likelihood_limit = corrected_likelihood_limit_bipo
    data_arr_selected = data_arr_scores[data_arr_scores['score'] >
                                        corrected_likelihood_limit]
    if len(data_arr_selected) == 0:
        return (len(data_arr_selected), output, [])
    db = DBSCAN(eps=DBSCAN_radius, min_samples=DBSCAN_samples)\
        .fit(pd.DataFrame(data_arr_selected).values[:, :4])
    data_arr_cluster = np.zeros(data_arr_selected.shape,
                                dtype=[('x', np.double), ('y', np.double),
                                       ('z', np.double), ('t', np.double),
                                       ('score', np.double), ('label', int)])
    data_arr_cluster['x'] = data_arr_selected['x']
    data_arr_cluster['y'] = data_arr_selected['y']
    data_arr_cluster['z'] = data_arr_selected['z']
    data_arr_cluster['t'] = data_arr_selected['t']
    data_arr_cluster['score'] = data_arr_selected['score']
    data_arr_cluster['label'] = db.labels_
    data_arr_df = pd.DataFrame(data_arr_cluster)
    # Drop DBSCAN outliers (label == -1) before the final density fit.
    data_wo_outliers = data_arr_df.query('label != -1').values[:, :4]
    if len(data_wo_outliers) == 0:
        return (len(data_arr_selected), output, [])
    selected_fit = KernelDensity(kernel='tophat', rtol=kde_rtol,
                                 bandwidth=kernel_radius).fit(data_wo_outliers)
    for row in events.iterrows():
        t = abs(row[1].event_time - event_time) / (2 * timestep)
        score = selected_fit.score([[row[1].x_3d_nn, row[1].y_3d_nn,
                                     row[1].z_3d_nn, t]])
        output['event_number'].append(row[1].event_number)
        output['run_number'].append(row[1].run_number)
        # tophat kernel: -inf log-likelihood means outside the cluster.
        output['in_veto_volume'].append(score != -np.inf)
    if probability:
        times = np.unique(data_wo_outliers[:, 3])  # np.unique returns sorted values
        p_time_list = []
        if len(times):
            for time in times:
                decay_time = halflife / np.log(2)
                real_time = time * 2 * timestep
                time_right = real_time + timestep / 2
                time_left = max(0, real_time - timestep / 2)
                # Probability that a decay with the given half-life falls in
                # this time bin, weighted by the bin's share of cluster points.
                p = (stats.expon.cdf(time_right, scale=decay_time) -
                     stats.expon.cdf(time_left, scale=decay_time))
                p_time_list.append(
                    p * len(data_arr_df.query('t == @time and label != -1')) /
                    pointcloud_size)
        return (len(data_arr_selected), output, p_time_list)
    return (len(data_arr_selected), output, [])
def find_cluster_attraction(cluster: List, vec: np.ndarray) -> float:
    # Fit a Gaussian KDE on the cluster and return the log-likelihood of vec
    # under it (note: score() returns a log-density, not a probability).
    kernel = KernelDensity(kernel="gaussian").fit(cluster)
    log_prob = kernel.score(np.asarray(vec).reshape(1, -1))
    return log_prob
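# Hypothetical usage of find_cluster_attraction above; the random cluster is
# made up for illustration:
import numpy as np

rng = np.random.default_rng(0)
cluster = rng.normal(size=(50, 3)).tolist()
print(find_cluster_attraction(cluster, np.zeros(3)))  # log-density at origin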
# Instantiate KDE models for the foreground and the background
kde_fg = KernelDensity(kernel='gaussian', bandwidth=1, algorithm='kd_tree',
                       leaf_size=100).fit(points_fg)
kde_bg = KernelDensity(kernel='gaussian', bandwidth=1, algorithm='kd_tree',
                       leaf_size=100).fit(points_bg)

# Initialise and compute the masks
score_kde_fg = np.zeros(img_input.shape[:2])  # fresh zero-filled matrices
score_kde_bg = np.zeros(img_input.shape[:2])
likelihood_fg = np.zeros(img_input.shape[:2])
coordinates = it.product(range(score_kde_fg.shape[0]),
                         range(score_kde_fg.shape[1]))
for x, y in tqdm_notebook(coordinates, total=np.prod(score_kde_fg.shape)):
    score_kde_fg[x, y] = np.exp(kde_fg.score(img_input[x, y, :].reshape(1, -1)))
    score_kde_bg[x, y] = np.exp(kde_bg.score(img_input[x, y, :].reshape(1, -1)))
    n = score_kde_fg[x, y] + score_kde_bg[x, y]
    if n == 0:
        n = 1
    # Normalised foreground likelihood for this pixel
    likelihood_fg[x, y] = score_kde_fg[x, y] / n
print('Finish!')

# Run the algorithm for the two masks
d_fg = dijkstra(xy_fg, likelihood_fg)
d_bg = dijkstra(xy_bg, 1 - likelihood_fg)
print('Finish 2 !')
margin = 1.0
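# A sketch of a vectorised alternative to the pixel loop above (assuming
# img_input is an H x W x C array): score_samples evaluates all pixels in a
# single call, which is much faster than per-pixel score().
H, W = img_input.shape[:2]
pixels = img_input.reshape(-1, img_input.shape[2])
fg = np.exp(kde_fg.score_samples(pixels)).reshape(H, W)
bg = np.exp(kde_bg.score_samples(pixels)).reshape(H, W)
likelihood_fg = fg / np.where(fg + bg == 0, 1, fg + bg)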
import random
import numpy as np
from sklearn.mixture import GaussianMixture  # GMM was removed from sklearn
from sklearn.neighbors import KernelDensity  # sklearn.neighbors.kde is deprecated
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search is deprecated

# Build a model to draw samples from
data = np.random.rand(3000, 2)
gmm = GaussianMixture(n_components=3)
gmm.fit(data)
sample, _ = gmm.sample(1000)  # GaussianMixture.sample returns (X, y)

# Get the best bandwidth with 20-fold cross-validation
grid = GridSearchCV(KernelDensity(),
                    {'bandwidth': np.linspace(0.001, 1.0, 30)}, cv=20)
grid.fit(sample)
print(grid.best_params_)

# Fit a KDE with the selected bandwidth
kde = KernelDensity(kernel='gaussian', bandwidth=0.0699).fit(sample)
print(kde.score(sample))

# GMM fit, see https://github.com/scikit-learn/scikit-learn/issues/7295
components = [2, 3, 5, 10]
for idx, value in enumerate(components):
    g = GaussianMixture(n_components=value, random_state=43)
    g.fit(sample)
    # score() gives the mean per-sample log-likelihood
    print("log likelihood for components = " + str(value) + " is " +
          str(g.score(sample)))