def plot_HalfNorm2(out_path='/scratch/chd5n/test.png'):
    """Fit normal and half-normal 2-component mixtures and plot both curves.

    A synthetic sample of 1000 points is drawn with ``np.random.sample`` and
    every second point is shifted by 4, which yields two separated clusters.
    One ``GeneralMixtureModel`` is fitted with ``NormalDistribution``
    components and one with ``HalfNormalDistribution`` components; both are
    evaluated on a grid from -15 to 15 (step 0.1) and drawn on one axis.

    Parameters
    ----------
    out_path : str
        File the figure is saved to. Defaults to the location that used to be
        hard-coded, so existing callers are unaffected.
    """
    # A normal(-2, 1) sample was used here before, but it triggered a scipy
    # runtime warning (possibly due to running an outdated version).
    X = np.random.sample(size=(1000, 1))
    X[::2] += 4

    modeln = GeneralMixtureModel.from_samples(NormalDistribution, 2, X)
    modelh = GeneralMixtureModel.from_samples(HalfNormalDistribution, 2, X)

    x = np.arange(-15, 15, 0.1)
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.plot(x, modeln.probability(x), label='Normal Mixture')
    # set_y is defined elsewhere in this file; presumably it evaluates the
    # half-normal mixture on the grid -- TODO confirm.
    ax.plot(x, set_y(x, modelh), label='Half Norm Mixture')
    ax.set_ylabel('Probability', fontsize=10)
    ax.legend(fontsize=10)

    plt.savefig(out_path, bbox_inches='tight')
    print('plot written to', out_path)
def addData(self, data, score):
    """Store the samples with their scores and try to refit the mixture model.

    Parameters
    ----------
    data :
        Samples handed to ``GeneralMixtureModel.from_samples`` as ``X``.
    score :
        Per-sample scores; must support ``.clip``. Values below ``1e-5`` are
        raised to ``1e-5`` before use.

    Side effects
    ------------
    Sets ``self.data`` and ``self.score`` (the clipped scores). On a
    successful fit ``self.model`` is replaced; if fitting fails, ``self.model``
    is left untouched and the failure is logged.
    """
    # Floor the scores so every sample keeps a strictly positive weight.
    score = score.clip(min=1e-5)
    self.data = data
    self.score = score

    # After clipping all scores are positive, so dividing by the L1 norm
    # makes the weights sum to one.
    score_normed = self.score / np.linalg.norm(self.score, ord=1)

    try:
        model = GeneralMixtureModel.from_samples(
            MultivariateGaussianDistribution,
            n_components=self.n_comp,
            X=self.data,
            weights=score_normed)
    except Exception:
        # Fitting is best effort. Unlike a bare ``except`` this lets
        # KeyboardInterrupt/SystemExit propagate, and the traceback is kept
        # so the cause of the failure is not lost.
        logging.info("catched an exception", exc_info=True)
    else:
        self.model = model
def fit_mixture(self,
                pos_left,
                pos_right,
                weights,
                n_components=2,
                tol=1e-4,
                maxiter=4000,
                verbose=False):
    """Fit a mixture by Nelder-Mead, seeded from a Gaussian mixture fit.

    A ``GeneralMixtureModel`` of multivariate Gaussians is first fitted to the
    midpoints of ``pos_left`` and ``pos_right``. Its means, covariances and
    normalised component weights are packed with ``self._paras_compose_`` and
    used as the starting point for minimising ``self._mixture_optpara``.

    Parameters
    ----------
    pos_left, pos_right : array-like
        Converted with ``np.asarray``; their element-wise mean seeds the fit.
    weights : array-like
        Sample weights for the seed fit; also forwarded to the objective.
    n_components : int
        Number of mixture components.
    tol : float
        Tolerance passed to ``opt.minimize``.
    maxiter : int
        Iteration cap passed to the optimiser.
    verbose : bool
        If true, the optimiser reports progress, the start and end parameters
        are printed, and a list (instead of ``None``) is passed to the
        objective as its last extra argument.

    Returns
    -------
    tuple
        ``(mus, covs, comp_ws, fun)``: the output of
        ``self._paras_decompose_`` on the optimum, plus the objective value.
    """
    lo = np.asarray(pos_left)
    hi = np.asarray(pos_right)
    weights = np.asarray(weights)
    trace = [] if verbose else None

    # Seed model fitted on the interval midpoints.
    midpoints = (lo + hi) / 2.0
    seed_model = GeneralMixtureModel.from_samples(
        MultivariateGaussianDistribution,
        n_components=n_components,
        X=midpoints,
        weights=weights,
        stop_threshold=0.01,
        n_jobs=2)

    mix_w = np.array(seed_model.weights)
    mix_w /= np.sum(mix_w)

    seed_params = [
        seed_model.distributions[k].parameters for k in range(n_components)
    ]
    seed_mus = [np.array(p[0]) for p in seed_params]
    seed_covs = [np.array(p[1]) for p in seed_params]
    x0 = self._paras_compose_(seed_mus, seed_covs, list(mix_w))

    solver = 'Nelder-Mead'
    solver_options = {'maxiter': maxiter, 'disp': verbose}
    result = opt.minimize(self._mixture_optpara,
                          x0,
                          args=(lo, hi, weights, n_components, trace),
                          method=solver,
                          tol=tol,
                          options=solver_options)

    if verbose:
        print("Method:{}; Initial parameter: {};".format(solver, x0))
        print("Converged Parameter: {}".format(result.x))

    mus, covs, comp_ws = self._paras_decompose_(result.x, n_components)
    return mus, covs, comp_ws, result.fun
def fit_mixture_model(counts, max_attempts=None):
    '''
    Fit a Poisson/Normal two-component mixture to log2(counts + 1).

    Code adapted from https://github.com/josephreplogle/guide_calling

    The model is re-fitted until both components have non-zero weight, the
    density is free of NaNs on a probe grid, and the Poisson component sits in
    the first position with the lower mean.

    Parameters
    ----------
    counts : numpy.ndarray
        One-dimensional array of raw counts.
    max_attempts : int or None
        Upper bound on the number of fits. ``None`` (the default) retries
        without limit, as before; note that this can loop forever on data for
        which no acceptable model is ever produced.

    Returns
    -------
    labels :
        Component assignment of every input value (``model.predict``).
    threshold :
        ``2 ** x`` for the first grid point ``x`` in ``[0, max + 2]`` whose
        posterior probability for the second component is at least 0.5. If no
        grid point qualifies, ``np.argmax`` yields index 0 and the threshold
        is ``2 ** 0 == 1``.

    Raises
    ------
    RuntimeError
        If ``max_attempts`` fits were made without an acceptable model.
    '''
    data = np.log2(counts + 1)
    reshaped_data = data.reshape(-1, 1)
    upper = max(data) + 2
    xs = np.linspace(-2, upper, 1000)

    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        attempt += 1
        model = GeneralMixtureModel.from_samples(
            [PoissonDistribution, NormalDistribution], 2, reshaped_data)
        first, second = model.distributions[0], model.distributions[1]
        # Checks are evaluated in the original order and short-circuit.
        rejected = (
            0 in model.weights  # one component was eliminated
            or np.isnan(model.probability(xs)).any()
            or first.parameters[0] > second.parameters[0]
            or first.name != 'PoissonDistribution')
        if not rejected:
            break
    else:
        # Only reached when the attempt budget is exhausted without a break.
        raise RuntimeError(
            "mixture model did not converge to an acceptable solution "
            "within {} attempts".format(max_attempts))

    labels = model.predict(reshaped_data)

    xs = np.linspace(0, upper, 1000)
    p_second_component = model.predict_proba(xs.reshape(-1, 1))[:, 1]
    threshold = 2**xs[np.argmax(p_second_component >= 0.5)]

    return labels, threshold
# For a handful of time steps, fit a 4-component Gaussian mixture to the
# weighted samples at that time and build an evaluation grid for plotting.
# NOTE(review): the original indentation was lost; every statement after the
# ``for`` header is assumed to belong to the loop body -- confirm.
# hiker_paths, p_filter, log_weights, depth_dict and
# pomegranate_to_scikitlearn are defined outside this fragment.
time_list = [100, 500, 900, 1500]
for time in time_list:
    samples = hiker_paths.get_all_at_time(time)
    weights = p_filter.weighting_func(log_weights)
    print(weights)
    print(samples)
    # Keep only the first two entries of every sample, as plain floats.
    samples = [[float(item[0]), float(item[1])] for item in samples]
    # Debug output only: ``test`` is not used by the fit below, but the call
    # still advances NumPy's global random state.
    test = np.random.multivariate_normal([50, 50], [[1, 0], [0, 1]], 10)
    print(test)
    gmm = GeneralMixtureModel.from_samples(MultivariateGaussianDistribution,
                                           n_components=4,
                                           X=samples,
                                           weights=weights)
    # presumably wraps the fitted mixture in a scikit-learn style object --
    # TODO confirm against the helper's definition.
    clf = pomegranate_to_scikitlearn(gmm)
    graph_shape = depth_dict["data"].shape
    print(graph_shape)
    # display predicted scores by the model as a contour plot
    ax = plt.subplot(111)
    # np.linspace is called without ``num``, so each axis uses NumPy's
    # default number of points between 0 and the extent of depth_dict["data"].
    x = np.linspace(0.0, graph_shape[0])
    y = np.linspace(0.0, graph_shape[1])
    X, Y = np.meshgrid(x, y)
    # One (x, y) pair per row, covering the whole grid.
    XX = np.array([X.ravel(), Y.ravel()]).T
# print("Naive Bayes - Semisupervised Learning Accuracy: {}".format((model_b.predict(x_val) == y_val).mean())) model_c = BayesClassifier.from_samples(MultivariateGaussianDistribution, x_train, y_train, inertia=0.0, pseudocount=0.0, stop_threshold=0.1, max_iterations=100, verbose=True, n_jobs=1) print("Bayes Classifier - Semisupervised Learning Accuracy: {}".format( (model_c.predict(x_val) == y_val).mean())) # general mixture model d0 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 0]) d1 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 1]) d2 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 2]) d3 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 3]) d4 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 4]) d5 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 5]) d6 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 6]) d7 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 7]) d8 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
def _initDists(self,
               X,
               distribution=MultivariateGaussianDistribution,
               technique="R_MV-GMM"):
    """Build one initial emission distribution per state.

    Parameters
    ----------
    X :
        Training data used by the data-driven techniques.
    distribution :
        Distribution class. Only referenced by the unfinished draft code that
        follows the ``return`` statement; the active techniques ignore it.
    technique : str
        Initialisation strategy (default ``"R_MV-GMM"``, the value that used
        to be hard-coded):

        * ``"GMM"``      - fit one mixture of ``self.nmix`` normal
          distributions on ``X`` and copy it for every state.
        * ``"MV-GMM"``   - same with multivariate Gaussians.
        * ``"MVG"``      - label ``X`` with ``self._initkmeans`` and fit one
          multivariate Gaussian per label.
        * ``"R_GMM"``    - mixtures of normals with a random integer mean
          (``np.random.randint(1, 10)``) and unit standard deviation.
        * ``"R_MV-GMM"`` - mixtures of ``randMVG()`` components (``randMVG``
          is defined elsewhere in this file).

    Returns
    -------
    list
        ``self.statesNumber`` distributions.

    Raises
    ------
    ValueError
        If ``technique`` is not one of the names above.
    """
    if technique == "GMM":  # gaussian mixture model
        #// uvgd = NormalDistribution.from_samples(X)
        #// gmm = GeneralMixtureModel([uvgd.copy() for _ in range(self.nmix)])
        gmm = GeneralMixtureModel.from_samples(
            distributions=[NormalDistribution for _ in range(self.nmix)],
            X=X)
        dists = [gmm.copy() for _ in range(self.statesNumber)]
    elif technique == "MV-GMM":  # multivariate gaussian mixture model
        #// mvgd = MultivariateGaussianDistribution.from_samples(X)
        #// gmm = GeneralMixtureModel([mvgd.copy() for _ in range(self.nmix)])
        gmm = GeneralMixtureModel.from_samples(distributions=[
            MultivariateGaussianDistribution for _ in range(self.nmix)
        ],
                                               X=X,
                                               n_components=3)
        dists = [gmm.copy() for _ in range(self.statesNumber)]
    elif technique == "MVG":
        # The labels must be captured: they were previously discarded, so the
        # comprehension below referenced an undefined ``y``.
        y = self._initkmeans(X=X, numClasses=self.statesNumber)
        dists = [
            MultivariateGaussianDistribution.from_samples(X=X[y == i])
            for i in range(self.statesNumber)
        ]
    elif technique == "R_GMM":  # random gaussian mixture model
        randNormal = lambda: NormalDistribution(np.random.randint(1, 10), 1)
        randGMM = lambda: GeneralMixtureModel(
            [randNormal() for _ in range(self.nmix)])
        dists = [randGMM() for _ in range(self.statesNumber)]
    elif technique == "R_MV-GMM":  # random multivariate gaussian mixture model
        randGMM = lambda: GeneralMixtureModel(
            [randMVG() for _ in range(self.nmix)])
        dists = [randGMM() for _ in range(self.statesNumber)]
    else:
        raise ValueError("unknown technique: {!r}".format(technique))
    return dists

    # NOTE: everything below is unreachable. These are the author's
    # unfinished drafts, kept verbatim for reference.
    #* not completed:
    #! GMM-HMM-k
    y = self._initkmeans(X, self.statesNumber)
    # list(map(print, y))
    return [
        GeneralMixtureModel.from_samples(distribution,
                                         X=X[y == i],
                                         n_components=self.nmix)
        for i in range(self.statesNumber)
    ]

    #! Kmeans init
    if not isinstance(X, BaseGenerator):
        data_generator = SequenceGenerator(X, None, None)
    else:
        data_generator = X
    initialization_batch_size = len(data_generator)
    X_ = []
    data = data_generator.batches()
    for i in range(initialization_batch_size):
        batch = next(data)
        X_.extend(batch[0])
    X_concat = np.concatenate(X_)
    if X_concat.ndim == 1:
        X_concat = X_concat.reshape(X_concat.shape[0], 1)
    n, d = X_concat.shape
    clf = Kmeans(self.statesNumber, init="kmeans++",
                 n_init=1)  # init should be one of
    clf.fit(X_concat, max_iterations=None, batches_per_epoch=None)
    y = clf.predict(X_concat)
    if callable(distribution):
        if d == 1:
            dists = [
                distribution.from_samples(X_concat[y == i][:, 0])
                for i in range(self.statesNumber)
            ]
        elif distribution.blank().d > 1:
            dists = [
                distribution.from_samples(X_concat[y == i])
                for i in range(self.statesNumber)
            ]
        else:
            print("error")
    return dists
from pomegranate import (
    NaiveBayes,
    NormalDistribution,
    UniformDistribution,
    ExponentialDistribution,
    GeneralMixtureModel,
    MultivariateGaussianDistribution,
    BernoulliDistribution,
)
import pandas as pd
import numpy as np

# Small two-column binary table used as training data for the mixture below.
X = pd.DataFrame({"A": [1, 0, 1, 0, 1], "B": [1, 1, 1, 1, 0]})

# Draw 1000 samples from a Bernoulli(0.4) distribution. The list is built
# directly instead of abusing a comprehension for its ``append`` side effect.
x = BernoulliDistribution(0.4)
vals = [x.sample() for _ in range(1000)]

# Naive Bayes over three univariate distributions; classify the point 10.
model = NaiveBayes([
    NormalDistribution(5, 2),
    UniformDistribution(0, 10),
    ExponentialDistribution(1.0)
])
model.predict(np.array([[10]]))

# Rebind ``model`` to a 3-component multivariate Gaussian mixture fitted on X.
model = GeneralMixtureModel.from_samples(MultivariateGaussianDistribution,
                                         n_components=3,
                                         X=X)
def generate_guide_rna_prediction(
        loom,
        guide_rnas,
        nguide_ca='nGuide',
        nguide_reads_ca='nGuideReads',
        cell_prediction_summary_ca='CellGuidePrediction',
        overwrite=False,
        only_generate_log2=False,
        ncell_threshold_for_guide=10,
        nguide_threshold_for_cell=10):
    """
    Predict which guide RNAs are present in each cell with a mixture model.

    This approach is inspired by Replogle et al. 2020
    (https://doi.org/10.1038/s41587-020-0470-y). However, instead of a
    Gaussian/Poisson mixture, this routine uses a Poisson/Poisson mixture.
    This routine uses the pomegranate package
    (https://github.com/jmschrei/pomegranate).

    For every guide the raw counts are log2-transformed (zero counts become
    ``-inf`` and are excluded from fitting) and written to ``<guide>_log2``.
    Unless `only_generate_log2` is set, a two-component mixture is fitted to
    the finite values; the component with the higher mean log2 count is
    reported as 1 in ``<guide>_prediction``, and predictions are zeroed for
    cells below `nguide_threshold_for_cell`.

    Parameters
    ----------
    loom : LoomConnection
        A LoomConnection object upon which guide RNA predictions will be made
    guide_rnas : iterable of strings
        a list or other iterable of strings, each corresponding to a column
        attribute of `loom` holding the raw counts of a given guide RNA over
        cells
    nguide_ca : str
        QC metric, indicating the name of the column attribute to use to
        indicate the number of predicted guide RNAs for a cell
        (Default value = 'nGuide')
    nguide_reads_ca : str
        QC metric, indicating the name of the column attribute to use to
        indicate the total number of guide RNA reads for a cell
        (Default value = 'nGuideReads')
    cell_prediction_summary_ca : str
        Indicates the name of the column attribute to use to indicate a
        summary of positively-predicted guide RNAs for a cell
        (Default value = 'CellGuidePrediction')
    overwrite : bool
        If False, will raise an exception if `nguide_reads_ca` or a per-guide
        ``_log2`` / ``_prediction`` column attribute has already been written.
        If True, will overwrite existing column attributes.
        (Default value = False)
    only_generate_log2 : bool
        If true, will generate log2 guide RNA counts, but will not apply any
        mixture model prediction and will not write the per-cell summaries.
        (Default value = False)
    ncell_threshold_for_guide : int
        Threshold for the number of cells wherein a guide should have nonzero
        counts for the mixture model to attempt prediction.
        (Default value = 10)
    nguide_threshold_for_cell : int
        Minimum total number of guide RNA reads (`nguide_reads_ca`) a cell
        must have for any prediction to be kept for that cell.
        (Default value = 10)

    Returns
    -------
    None
        Results are written to column attributes of `loom`.
    """
    from panopticon.utilities import import_check
    exit_code = import_check("pomegranate",
                             'conda install -c anaconda pomegranate')
    if exit_code != 0:
        return
    import pandas as pd

    # The guides are traversed twice; materialise them so that a one-shot
    # iterator is not silently exhausted after the first pass.
    guide_rnas = list(guide_rnas)

    if nguide_reads_ca in loom.ca.keys() and not overwrite:
        raise Exception(
            "{} already in loom.ca.keys(); if intended, set overwrite argument to True"
            .format(nguide_reads_ca))

    # Total guide reads per cell, and the cells that pass the read threshold.
    guide_rna_dfs = pd.concat([
        pd.DataFrame(loom.ca[guide_rna], columns=[guide_rna], copy=True)
        for guide_rna in guide_rnas
    ],
                              axis=1)
    loom.ca[nguide_reads_ca] = guide_rna_dfs.sum(axis=1).values
    threshold_for_cell_mask = loom.ca[
        nguide_reads_ca] >= nguide_threshold_for_cell

    if not only_generate_log2:
        from pomegranate import GeneralMixtureModel, PoissonDistribution

    prediction_ca_names = []
    for guide_rna in guide_rnas:
        if guide_rna not in loom.ca.keys():
            raise Exception(
                "{} not in loom.ca.keys(); guide_rnas must name column attributes in loom holding raw guide RNA counts"
                .format(guide_rna))
        new_ca_name = guide_rna + '_log2'
        if new_ca_name in loom.ca.keys() and not overwrite:
            raise Exception(
                "{} already in loom.ca.keys(); rename guide column attribute and re-run, or set overwrite argument to True"
                .format(new_ca_name))
        # Zero counts map to -inf on purpose (they are masked out below), so
        # the divide-by-zero warning carries no information here.
        with np.errstate(divide='ignore'):
            loom.ca[new_ca_name] = np.log2(loom.ca[guide_rna])
        if only_generate_log2:
            continue

        prediction_ca_name = guide_rna + '_prediction'
        prediction_ca_names.append(prediction_ca_name)
        if prediction_ca_name in loom.ca.keys() and not overwrite:
            raise Exception(
                "{} already in loom.ca.keys(); rename guide rna column attribute and re-run, or set overwrite argument to True"
                .format(prediction_ca_name))

        log2_counts = loom.ca[new_ca_name]
        cellmask = np.isfinite(log2_counts)
        if not cellmask.all():
            if cellmask.sum() >= ncell_threshold_for_guide:
                # have minimum cells for guide: fit on the finite values only
                # and leave NaN for the cells that could not be scored.
                finite_counts = log2_counts[cellmask].reshape(-1, 1)
                model = GeneralMixtureModel.from_samples(
                    [PoissonDistribution, PoissonDistribution],
                    n_components=2,
                    X=finite_counts)
                predictions = np.full(log2_counts.shape[0], np.nan)
                predictions[cellmask] = model.predict(finite_counts)
            else:
                predictions = np.zeros(loom.shape[1], dtype=int)
        else:
            all_counts = log2_counts.reshape(-1, 1)
            model = GeneralMixtureModel.from_samples(
                [PoissonDistribution, PoissonDistribution],
                n_components=2,
                X=all_counts)
            predictions = np.array(model.predict(all_counts))

        # Component order is arbitrary: make label 1 the component with the
        # higher mean log2 count.
        if log2_counts[predictions == 0].mean() > log2_counts[
                predictions == 1].mean():
            predictions = 1 - predictions
        predictions = np.nan_to_num(predictions, nan=0.0)
        predictions *= threshold_for_cell_mask
        loom.ca[prediction_ca_name] = predictions

    if not prediction_ca_names:
        # Nothing was predicted (e.g. only_generate_log2=True); pd.concat
        # raises on an empty list, so there is no summary to write.
        return

    guide_prediction_dfs = pd.concat([
        pd.DataFrame(loom.ca[prediction_ca_name],
                     columns=[prediction_ca_name],
                     copy=True) for prediction_ca_name in prediction_ca_names
    ],
                                     axis=1)
    loom.ca[nguide_ca] = guide_prediction_dfs.sum(axis=1).values
    # '+'-joined names of the prediction columns that are 1 for each cell.
    loom.ca[cell_prediction_summary_ca] = guide_prediction_dfs.apply(
        lambda x: '+'.join(guide_prediction_dfs.columns[np.where(x == 1)[0]]),
        axis=1).values