def test_anova(self):
    "Tests anova"
    (E, NOx, _, _, _, results) = self.d
    gas = loess(E, NOx, span=2. / 3.)
    gas.fit()
    gas_null = loess(E, NOx, span=1.0)
    gas_null.fit()
    gas_anova = loess_anova(gas, gas_null)
    gas_anova_theo = results[4]
    npt.assert_almost_equal(gas_anova.dfn, gas_anova_theo[0], 5)
    npt.assert_almost_equal(gas_anova.dfd, gas_anova_theo[1], 5)
    npt.assert_almost_equal(gas_anova.F_value, gas_anova_theo[2], 5)
    npt.assert_almost_equal(gas_anova.Pr_F, gas_anova_theo[3], 5)
def fit_loess(x: List[float], y: List[float], span: float, degree: int) -> object:
    try:
        # pass the caller's degree through (the original hard-coded degree=2,
        # silently ignoring the `degree` argument)
        lobj = sl.loess(x, y, span=span, degree=degree)
        lobj.fit()
        return lobj
    except ValueError:
        return None
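# --- Hedged usage sketch for fit_loess, not part of the original source.
# It assumes the snippet's own imports: `import skmisc.loess as sl` and
# `from typing import List`. The data below is made up.
import numpy as np

x = np.linspace(0., 10., 50)
y = np.sin(x) + np.random.normal(scale=0.2, size=x.size)
lobj = fit_loess(list(x), list(y), span=0.5, degree=2)
if lobj is not None:
    print(lobj.outputs.enp)  # effective number of parameters of the smoother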
def test_1dpredict(self):
    "Basic test 1d - prediction"
    (E, NOx, gas_fit_E, _, _, results) = self.d
    gas = loess(E, NOx, span=2. / 3.)
    gas.fit()
    prediction = gas.predict(gas_fit_E, stderror=False)
    npt.assert_almost_equal(prediction.values, results[2], 6)
def test_failures(self):
    "Tests failures"
    (E, NOx, gas_fit_E, _, _, _) = self.d
    gas = loess(E, NOx, span=2. / 3.)

    # This one should fail (all parametric)
    gas.model.parametric = True
    with pytest.raises(ValueError):
        gas.fit()

    # This one also (all drop_square)
    gas.model.drop_square = True
    with pytest.raises(ValueError):
        gas.fit()

    gas.model.degree = 1
    with pytest.raises(ValueError):
        gas.fit()

    # This one should not (revert to std)
    gas.model.parametric = False
    gas.model.drop_square = False
    gas.model.degree = 2
    gas.fit()

    # Now, for predict .................
    prediction = gas.predict(gas_fit_E, stderror=False)
    # This one should fail (extrapolation & blending)
    with pytest.raises(ValueError):
        gas.predict(prediction.values, stderror=False)
    # But this one should not ..........
    gas.predict(gas_fit_E, stderror=False)
def plot_loess(x, y, plt_idx):
    # Sort data by x-coordinate for plotting
    ind = np.argsort(x)
    x = x[ind]
    y = y[ind]
    l = loess(x, y, surface='direct')
    l.fit()
    pred = l.predict(x, stderror=True)
    conf = pred.confidence(alpha=0.01)
    lowess = pred.values
    ll = np.maximum(0, conf.lower)
    ul = np.minimum(1, conf.upper)
    plt.subplot(2, 2, plt_idx)
    plt.plot(x, y, '+')
    plt.plot(x, lowess)
    plt.xlim(right=1100)
    y_margin = subsample_proportion / 20
    plt.ylim(bottom=-y_margin, top=subsample_proportion + y_margin)
    if plt_idx % 2 == 1:
        plt.ylabel('Transition probability')
    if plt_idx > 2:
        plt.xlabel('Distance to object')
    plt.fill_between(x, ll, ul, alpha=.33)
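# --- Hedged driver sketch for plot_loess, not part of the original source.
# plot_loess reads a module-level `subsample_proportion` for its y-limits,
# so one is defined here; the data is synthetic.
import numpy as np
import matplotlib.pyplot as plt
from skmisc.loess import loess

subsample_proportion = 0.5
rng = np.random.default_rng(0)
for idx in range(1, 5):
    x = rng.uniform(0, 1000, 150)
    y = np.clip(subsample_proportion - x / 3000
                + rng.normal(scale=0.05, size=150), 0, 1)
    plot_loess(x, y, idx)
plt.show()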
def loess_curve(da_ts, time_dim='time', season=None, plot=True):
    from skmisc.loess import loess
    import matplotlib.pyplot as plt
    import xarray as xr
    import numpy as np
    if season is not None:
        da_ts = da_ts.sel({time_dim: da_ts[time_dim + '.season'] == season})
    x = da_ts.dropna(time_dim)[time_dim].values
    y = da_ts.dropna(time_dim).values
    l_obj = loess(x, y)
    l_obj.fit()
    pred = l_obj.predict(x, stderror=True)
    conf = pred.confidence()
    lowess = np.copy(pred.values)
    ll = np.copy(conf.lower)
    ul = np.copy(conf.upper)
    da_lowess = xr.Dataset()
    da_lowess['mean'] = xr.DataArray(lowess, dims=[time_dim])
    da_lowess['upper'] = xr.DataArray(ul, dims=[time_dim])
    da_lowess['lower'] = xr.DataArray(ll, dims=[time_dim])
    da_lowess[time_dim] = x
    if plot:
        plt.plot(x, y, '+')
        plt.plot(x, lowess)
        plt.fill_between(x, ll, ul, alpha=.33)
        plt.show()
    return da_lowess
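# --- Hedged usage sketch for loess_curve, not part of the original source.
# skmisc's loess needs numeric x, so a numeric 'time' coordinate is used here;
# real datetime coordinates would have to be converted first.
import numpy as np
import xarray as xr

time = np.arange(1950., 2020.)
vals = 0.01 * (time - 1950.) + np.random.normal(scale=0.1, size=time.size)
da = xr.DataArray(vals, dims=['time'], coords={'time': time})
smoothed = loess_curve(da, plot=False)
print(smoothed['mean'].values[:5])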
def loessPlot(X, y, scatter=True, res=100, x_min=None, x_max=None, x_plot=None,
              ci_alpha=0.05, ax=None, scatter_kws={}, line_kws={}, fill_kws={},
              **loess_args):
    '''
    Plots a loess curve with shaded confidence intervals using the
    scikit-misc loess function (https://has2k1.github.io/scikit-misc/loess.html)

    - X can be a (n,) or (n,k) ndarray. If X is (n,k), the x-axis of the plot
      will correspond to the covariate in the first column, with the other
      covariates entering as invisible controls.
    - y must be a (n,) ndarray
    - res, x_min, x_max set the resolution and domain for sampling the loess
      prediction. If x_min and x_max are not provided they are set to the min
      and max of the first dimension of X.
    - x_plot (optional) overrides res, x_min, x_max and sets the sampling
      points for the plot directly. Must be a 1-d ndarray.
    - ci_alpha sets the confidence interval alpha parameter (default=0.05)
    - Additional loess args can be passed as named parameters
    '''
    # Set default arguments for graphic elements
    scatter_args = {'s': 10, 'linewidth': 0}
    scatter_args.update(scatter_kws)
    line_args = {}
    line_args.update(line_kws)
    fill_args = {'color': 'k', 'alpha': 0.25, 'linewidth': 0}
    fill_args.update(fill_kws)

    # Split off first dimension of X for plotting
    if len(X.shape) > 1:
        x = X[:, 0]
    else:
        x = X
        X = X[:, np.newaxis]

    # Sort out plot range and sampling points
    if x_min is None:
        x_min = x.min()
    if x_max is None:
        x_max = x.max()
    if x_plot is None:
        x_plot = np.linspace(x_min, x_max, res)

    # Compute loess curve and confidence intervals
    loessObject = loess.loess(X, y, **loess_args)
    prediction = loessObject.predict(x_plot, True)
    confidence_intervals = prediction.confidence(alpha=ci_alpha)

    # `ax` was referenced but missing from the original signature; it is now
    # an explicit keyword argument (the call in localPartialCorr passes it)
    if ax is None:
        ax = plt.gca()

    # Plot
    if scatter:
        ax.scatter(x, y, **scatter_args)
    ax.plot(x_plot, prediction.values, **line_args)
    ax.fill_between(x_plot, confidence_intervals.upper,
                    confidence_intervals.lower, **fill_args)
    return ax
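# --- Hedged usage sketch for loessPlot, not part of the original source.
# Assumes the module-level imports the snippet relies on: numpy as np,
# matplotlib.pyplot as plt, and `from skmisc import loess`.
import numpy as np
import matplotlib.pyplot as plt
from skmisc import loess

rng = np.random.default_rng(1)
X = rng.uniform(0., 1., 200)
y = X ** 2 + rng.normal(scale=0.1, size=200)
ax = loessPlot(X, y, ci_alpha=0.05, line_kws={'color': 'C1'})
plt.show()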
def test_1dbasic_alt(self):
    "Basic test 1d - part #2"
    (E, NOx, _, _, _, results) = self.d
    gas_null = loess(E, NOx)
    gas_null.model.span = 1.0
    gas_null.fit()
    npt.assert_almost_equal(gas_null.outputs.fitted_values, results[1], 6)
    npt.assert_almost_equal(gas_null.outputs.enp, 3.5, 1)
    npt.assert_almost_equal(gas_null.outputs.residual_scale, 0.5197, 4)
def test_1dbasic(self):
    "Basic test 1d"
    (E, NOx, _, _, _, results) = self.d
    gas = loess(E, NOx)
    gas.model.span = 2. / 3.
    gas.fit()
    npt.assert_almost_equal(gas.outputs.fitted_values, results[0], 6)
    npt.assert_almost_equal(gas.outputs.enp, 5.5, 1)
    npt.assert_almost_equal(gas.outputs.residual_scale, 0.3404, 4)
def localPartialCorr(y1, y2, X, res=100, x_min=None, x_max=None, x_plot=None,
                     ci_alpha=0.05, inner_bw=10, ax=None, line_kws={},
                     fill_kws={}, **loess_args):
    '''
    Computes the local partial correlation between y1 and y2 given controls X

    WARNING: Standard errors should not be taken seriously. The code needs to
    be updated to estimate them more precisely, or at least to correct for
    the intermediate smoothing step.
    '''
    # Split off first dimension of X for defining distances
    if len(X.shape) > 1:
        x = X[:, 0]
    else:
        x = X
        X = X[:, np.newaxis]

    # Sort out plot range and sampling points
    if x_min is None:
        x_min = x.min()
    if x_max is None:
        x_max = x.max()
    if x_plot is None:
        x_plot = np.linspace(x_min, x_max, res)

    # Compute local residuals
    loessObjects = [loess.loess(X, y, **loess_args) for y in (y1, y2)]
    for loessObject in loessObjects:
        loessObject.fit()
    r1, r2 = [loessObject.outputs.fitted_residuals
              for loessObject in loessObjects]

    # Compute invariant components of weighted correlation
    r11 = r1**2
    r22 = r2**2
    r12 = r1 * r2

    # Compute local kernel bandwidth for each plot point
    n_span = int(inner_bw)
    h = np.zeros(res)
    for i in range(res):
        d = np.abs(x - x_plot[i])
        h[i] = np.partition(d, n_span)[n_span]

    # Compute locally weighted correlation for each x_plot point
    corr = np.zeros(res)
    for i in range(res):
        # Construct weight vector using the tri-cube weight function
        d = np.abs(x - x_plot[i])
        w = (1 - (d / h[i])**3)**3
        w = np.clip(w, a_min=0, a_max=None)
        # Compute weighted correlation
        corr[i] = np.dot(w, r12) / np.sqrt(np.dot(w, r11) * np.dot(w, r22))

    return loessPlot(x_plot, corr, x_plot=x_plot, ax=ax, scatter=False,
                     line_kws=line_kws, fill_kws=fill_kws, **loess_args)
def smooth_fit(
    xs: np.ndarray,
    ys: np.ndarray,
    dist_thrs: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Smooth a curve using loess

    Performs curve fitting using skmisc.loess; points whose x values lie
    beyond 'dist_thrs' are excluded.

    Parameters
    ----------
    xs : np.ndarray
        x values
    ys : np.ndarray
        y values
    dist_thrs : float
        exclude (x, y) tuples where |x| >= dist_thrs

    Returns
    -------
    A tuple with the included x and y values (xs', ys'), the fitted y values
    (ys_hat), and the associated standard errors. The tuple has the form
    (xs', ys', ys_hat, stderr)
    """
    srt = np.argsort(xs)
    xs = xs[srt]
    ys = ys[srt]

    if dist_thrs is None:
        dist_thrs = np.inf

    keep = np.abs(xs) < dist_thrs
    xs = xs[keep]
    ys = ys[keep]

    # generate loess class object
    ls = loess(xs, ys)
    # fit loess class to data
    ls.fit()
    # predict on data
    pred = ls.predict(xs, stderror=True)
    # get predicted values
    ys_hat = pred.values
    # get standard error
    stderr = pred.stderr

    return (xs, ys, ys_hat, stderr)
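# --- Hedged usage sketch for smooth_fit, not part of the original source.
# Assumes `from skmisc.loess import loess` plus the typing imports used in
# the signature (`Optional`, `Tuple`); the data is synthetic.
import numpy as np

rng = np.random.default_rng(2)
xs = rng.uniform(0., 5., 200)
ys = np.exp(-xs) + rng.normal(scale=0.05, size=200)
xs_kept, ys_kept, ys_hat, stderr = smooth_fit(xs, ys, dist_thrs=4.0)
print(xs_kept.size, ys_hat[:3], stderr[:3])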
def test_2d_pred_nodata(self):
    "2D prediction - nodata"
    (x, y, _, _, _) = self.d
    madeup = loess(x, y)
    with pytest.raises(ValueError):
        madeup.predict(None)
def test_2dbasic(self):
    "2D standard"
    (x, y, results, _, _) = self.d
    madeup = loess(x, y)
    madeup.model.span = 0.5
    madeup.model.normalize = True
    madeup.fit()
    npt.assert_almost_equal(madeup.outputs.fitted_values, results[0], 5)
    npt.assert_almost_equal(madeup.outputs.enp, 14.9, 1)
    npt.assert_almost_equal(madeup.outputs.residual_scale, 0.9693, 4)
def test_2d_pred_confinv(self):
    "2D prediction - confidence"
    (x, y, results, _, newdata2) = self.d
    madeup = loess(x, y)
    madeup.model.span = 0.5
    madeup.model.normalize = True
    prediction = madeup.predict(newdata2, stderror=True)
    ci = prediction.confidence(alpha=0.01)
    npt.assert_almost_equal(ci.lower, results[6][::3], 5)
    npt.assert_almost_equal(ci.fit, results[6][1::3], 5)
    npt.assert_almost_equal(ci.upper, results[6][2::3], 5)
def test_2d_pred_nostderr(self):
    "2D prediction - no stderr"
    (x, y, results, newdata1, _) = self.d
    madeup = loess(x, y)
    madeup.model.span = 0.5
    madeup.model.normalize = True
    prediction = madeup.predict(newdata1, stderror=False)
    npt.assert_almost_equal(prediction.values, results[4], 5)

    # Repeat the prediction (the second call was commented out in the
    # original, leaving the second assert redundant)
    prediction = madeup.predict(newdata1, stderror=False)
    npt.assert_almost_equal(prediction.values, results[4], 5)
def test_1dpredict_2(self):
    "Basic test 1d - new predictions"
    (E, NOx, _, newdata, _, results) = self.d
    # gas = loess(E, NOx, span=2./3.)
    gas = loess(E, NOx)
    gas.model.span = 2. / 3.
    prediction = gas.predict(newdata, stderror=True)
    ci = prediction.confidence(alpha=0.01)
    npt.assert_almost_equal(ci.lower, results[3][0::3], 6)
    npt.assert_almost_equal(ci.fit, results[3][1::3], 6)
    npt.assert_almost_equal(ci.upper, results[3][2::3], 6)
def test_2d_modflags(self):
    "2D - modification of model flags"
    (x, y, results, _, _) = self.d
    madeup = loess(x, y)
    madeup.model.span = 0.8
    madeup.model.drop_square = [True, False]
    madeup.model.parametric = [True, False]
    npt.assert_equal(madeup.model.parametric[:2], [1, 0])
    madeup.fit()
    npt.assert_almost_equal(madeup.outputs.fitted_values, results[1], 5)
    npt.assert_almost_equal(madeup.outputs.enp, 6.9, 1)
    npt.assert_almost_equal(madeup.outputs.residual_scale, 1.4804, 4)
def test_2d_modfamily(self):
    "2D - family modification"
    (x, y, results, _, _) = self.d
    madeup = loess(x, y)
    madeup.model.span = 0.8
    madeup.model.drop_square = [True, False]
    madeup.model.parametric = [True, False]
    madeup.model.family = "symmetric"
    madeup.fit()
    npt.assert_almost_equal(madeup.outputs.fitted_values, results[2], 5)
    npt.assert_almost_equal(madeup.outputs.enp, 6.9, 1)
    npt.assert_almost_equal(madeup.outputs.residual_scale, 1.0868, 4)
def fit_lowess(y_pred, y_true):
    l = loess(y_pred, y_true)
    pred, conf, smlowess, ll, ul = None, None, None, None, None
    try:
        l.fit()
        pred = l.predict(y_pred, stderror=True)
        conf = pred.confidence()
        smlowess = pred.values
        ll = conf.lower
        ul = conf.upper
    except ValueError as e:
        print(e)
    return pred, conf, smlowess, ll, ul
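# --- Hedged usage sketch for fit_lowess, not part of the original source.
# Assumes `from skmisc.loess import loess` at module level; data is made up.
import numpy as np

y_pred = np.sort(np.random.rand(100))
y_true = y_pred + np.random.normal(scale=0.05, size=100)
pred, conf, smlowess, ll, ul = fit_lowess(y_pred, y_true)
if pred is not None:
    print(smlowess[:3], ll[:3], ul[:3])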
def feature_select(x, gene_num=2000):
    '''Select highly variable genes (HVGs)

    (See [Stuart *et al*, (2019)](https://www.nature.com/articles/nbt.4096)
    and its early version
    [preprint](https://www.biorxiv.org/content/10.1101/460147v1.full.pdf)
    Page 12-13: Data preprocessing - Feature selection for individual
    datasets).

    Parameters
    ----------
    x : np.array \([N, G^{raw}]\)
        The raw count data.
    gene_num : int, optional
        The number of genes to retain.

    Returns
    ----------
    x : np.array \([N, G]\)
        The count data after gene selection.
    index : np.array \([G, ]\)
        The selected index of genes.
    '''
    n, p = x.shape

    # mean and variance of each gene of the unnormalized data
    mean, var = np.mean(x, axis=0), np.var(x, axis=0, ddof=1)

    # model log10(var)~log10(mean) by local fitting of polynomials of degree 2
    loess_model = loess.loess(np.log10(mean), np.log10(var),
                              span=0.3, degree=2, family='gaussian')
    loess_model.fit()
    fitted = loess_model.outputs.fitted_values

    # standardize the features
    z = (x - mean) / np.sqrt(10**fitted)

    # clip the standardized features to remove outliers
    z = np.clip(z, -np.inf, np.sqrt(n))

    # the variance of the standardized features across all cells represents a
    # measure of single-cell dispersion after controlling for mean expression
    feature_score = np.sum(z**2, axis=0) / (n - 1)

    # feature selection
    index = feature_score.argsort()[::-1][0:gene_num]
    return x[:, index], index
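# --- Hedged usage sketch for feature_select, not part of the original
# source. Assumes `from skmisc import loess` as used inside the function.
# Poisson counts keep every gene's mean strictly positive, which the
# log10(mean) step requires.
import numpy as np

rng = np.random.default_rng(3)
counts = rng.poisson(lam=2.0, size=(100, 500)).astype(float)
selected, index = feature_select(counts, gene_num=50)
print(selected.shape, index[:5])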
def test_2d_pred_stderr(self):
    "2D prediction - w/ stderr"
    (x, y, results, _, newdata2) = self.d
    madeup = loess(x, y)
    madeup.model.span = 0.5
    madeup.model.normalize = True
    prediction = madeup.predict(newdata2, stderror=True)
    npt.assert_almost_equal(prediction.values, results[5], 5)
    npt.assert_almost_equal(prediction.stderr, [0.276746, 0.278009], 5)
    npt.assert_almost_equal(prediction.residual_scale, 0.969302, 6)
    npt.assert_almost_equal(prediction.df, 81.2319, 4)

    # Direct access
    prediction = madeup.predict(newdata2, stderror=True)
    npt.assert_almost_equal(prediction.values, results[5], 5)
    npt.assert_almost_equal(prediction.stderr, [0.276746, 0.278009], 5)
    npt.assert_almost_equal(prediction.residual_scale, 0.969302, 6)
    npt.assert_almost_equal(prediction.df, 81.2319, 4)
def select_hvf_pegasus(data: AnnData, consider_batch: bool,
                       n_top: int = 2000, span: float = 0.02) -> None:
    """ Select highly variable features using the pegasus method """
    if "robust" not in data.var:
        raise ValueError("Please run `qc_metrics` to identify robust genes")

    estimate_feature_statistics(data, consider_batch)

    robust_idx = data.var["robust"].values
    hvf_index = np.zeros(robust_idx.sum(), dtype=bool)

    mean = data.var.loc[robust_idx, "mean"]
    var = data.var.loc[robust_idx, "var"]

    lobj = sl.loess(mean, var, span=span, degree=2)
    lobj.fit()

    rank1 = np.zeros(hvf_index.size, dtype=int)
    rank2 = np.zeros(hvf_index.size, dtype=int)

    delta = var - lobj.outputs.fitted_values
    fc = var / lobj.outputs.fitted_values

    rank1[np.argsort(delta)[::-1]] = range(hvf_index.size)
    rank2[np.argsort(fc)[::-1]] = range(hvf_index.size)
    hvf_rank = rank1 + rank2

    hvf_index[np.argsort(hvf_rank)[:n_top]] = True

    data.var["hvf_loess"] = 0.0
    data.var.loc[robust_idx, "hvf_loess"] = lobj.outputs.fitted_values

    data.var["hvf_rank"] = -1
    data.var.loc[robust_idx, "hvf_rank"] = hvf_rank
    data.var["highly_variable_features"] = False
    data.var.loc[robust_idx, "highly_variable_features"] = hvf_index
def EM_initial_guess(data, times, nulls):
    # Initialize the EM algorithm as indicated in the report.
    # Returns the estimated sigma, the fitted loess and the uniform
    # probabilities.
    n_genes, n_times = data.shape
    sigmas = np.sqrt(np.var(data, axis=1))
    fit_loess = np.zeros(data.shape)
    # For every gene
    for ix, row in data.iterrows():
        # Fit a loess.
        model = loess(x=times, y=row)
        model.fit()
        fit_loess[ix] = model.predict(newdata=times).values
    # Set the probabilities p_j uniformly.
    n_0 = np.sum(np.sum(nulls))
    prob = n_0 / (2 * n_times * n_genes)  # p is 0.5 * probability to be 0
    p = [prob for _ in range(n_genes)]
    return sigmas, fit_loess, p
def EM_M_step(data, q, times):
    fit_loess = np.zeros(data.shape)
    p = np.zeros(data.shape[0])
    for ix, row in data.iterrows():
        if np.var(row) != 0:
            # Update the function f_j for every gene by fitting a weighted
            # loess.
            model = loess(x=times, y=row, weights=q[ix])
            model.fit()
            fit_loess[ix] = model.predict(times).values
            # Update the probabilities p_j
            p[ix] = np.mean(q[ix])
    # We know that our function cannot be negative.
    fit_loess[fit_loess < 0] = 0
    return fit_loess, p
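# --- Hedged toy shapes for one EM M-step, not part of the original source.
# Assumes `from skmisc.loess import loess` plus numpy/pandas at module level;
# `q` stands in for posterior weights from a preceding E-step.
import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
times = np.linspace(0., 10., 12)
data = pd.DataFrame(rng.random((5, 12)))   # 5 genes x 12 time points
q = rng.random((5, 12))                    # hypothetical E-step weights
f_hat, p = EM_M_step(data, q, times)
print(f_hat.shape, p)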
def generate_residuals(x, y, bandwidth=0.05):
    """
    This function runs a series of loess regressions for different response
    variables (y) on a single explanatory variable (x) and computes the
    corresponding residuals.
    """
    # Turn input data into np.ndarrays.
    y = np.array(y)
    x = np.array(x)

    # Determine number of observations and number of columns for the
    # outcome variable.
    n = len(y)
    col_len = len(y[0])

    res = np.zeros([n, col_len])
    for i in range(col_len):
        yfit = loess(x, y[:, i], span=bandwidth, degree=1)
        yfit.fit()
        res[:, i] = yfit.outputs.fitted_residuals
    return res
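# --- Hedged usage sketch for generate_residuals, not part of the original
# source. The default bandwidth of 0.05 is quite narrow, so a wider span is
# used for this small synthetic sample.
import numpy as np

rng = np.random.default_rng(5)
x = rng.uniform(0., 1., 150)
y = np.column_stack([x ** 2, np.sin(3 * x), x]) \
    + rng.normal(scale=0.1, size=(150, 3))
res = generate_residuals(x, y, bandwidth=0.3)
print(res.shape)  # (150, 3)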
def loess_data(xs, ys):
    ixes = range(len(xs))
    sorted_xs = []
    sorted_ys = []
    for ix in sorted(ixes, key=lambda x: xs[x]):
        sorted_xs.append(xs[ix])
        sorted_ys.append(ys[ix])
    l = loess(sorted_xs, sorted_ys)
    l.fit()
    pred_x = sorted(list(set(sorted_xs)))
    pred = l.predict(pred_x, stderror=True)
    conf = pred.confidence()
    lowess = pred.values
    ll = conf.lower
    ul = conf.upper
    return pred_x, lowess, ll, ul
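# --- Hedged usage sketch for loess_data, not part of the original source.
# Assumes `from skmisc.loess import loess` at module level.
import numpy as np

rng = np.random.default_rng(6)
xs = rng.uniform(0., 10., 80)
ys = np.sin(xs) + rng.normal(scale=0.3, size=80)
px, fitted, ll, ul = loess_data(list(xs), list(ys))
print(len(px), fitted[:3])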
def DOC_loess(do: list, p_pair: str, p_span: float, p_degree: float,
              p_family: str, p_iterations: int, p_surface: str):
    # Subset
    OL = do[0]
    DIS = do[1]
    OL_rows, OL_cols = OL.shape
    # Overlap values for loess prediction
    xs = np.linspace(start=0, stop=1, num=1001)
    # Vectorize
    if not p_pair:
        tril = np.tril_indices(OL_rows, k=-1)
        OL_tri = OL.values[tril]
        DIS_tri = DIS.values[tril]
    else:
        OL_tri = OL.values
        DIS_tri = DIS.values
    # To data frame
    DF_l = pd.DataFrame({'y': DIS_tri, 'x': OL_tri})
    DF_l = DF_l.loc[~DF_l.isna().any(axis=1)]
    # Loess
    LOW = loess(y=DF_l.y, x=DF_l.x, span=p_span, degree=p_degree,
                family=p_family, iterations=p_iterations, surface=p_surface)
    xs = [round(x, 3) for x in xs if DF_l.x.min() < x < DF_l.x.max()]
    LOW_pred = LOW.predict(newdata=xs)
    LOW_P = pd.DataFrame({"Overlap": xs, "LOWESS": LOW_pred.values})
    LOW_P = LOW_P.loc[~LOW_P.isna().any(axis=1)]
    return LOW_P
def _highly_variable_genes_seurat_v3(
    adata: AnnData,
    layer: Optional[str] = None,
    n_top_genes: int = 2000,
    batch_key: Optional[str] = None,
    check_values: bool = True,
    span: float = 0.3,
    subset: bool = False,
    inplace: bool = True,
) -> Optional[pd.DataFrame]:
    """\
    See `highly_variable_genes`.

    For further implementation details see
    https://www.overleaf.com/read/ckptrbgzzzpg

    Returns
    -------
    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`)
    or updates `.var` with the following fields

    highly_variable : bool
        boolean indicator of highly-variable genes
    **means**
        means per gene
    **variances**
        variance per gene
    **variances_norm**
        normalized variance per gene, averaged in the case of multiple batches
    highly_variable_rank : float
        Rank of the gene according to normalized variance, median rank in the
        case of multiple batches
    highly_variable_nbatches : int
        If batch_key is given, this denotes in how many batches genes are
        detected as HVG
    """
    try:
        from skmisc.loess import loess
    except ImportError:
        raise ImportError(
            'Please install skmisc package via `pip install --user scikit-misc`'
        )

    X = adata.layers[layer] if layer is not None else adata.X
    if check_values and not check_nonnegative_integers(X):
        warnings.warn(
            "`flavor='seurat_v3'` expects raw count data, but non-integers were found.",
            UserWarning,
        )

    if batch_key is None:
        batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
    else:
        batch_info = adata.obs[batch_key].values

    norm_gene_vars = []
    for b in np.unique(batch_info):
        ad = adata[batch_info == b]
        X = ad.layers[layer] if layer is not None else ad.X

        mean, var = _get_mean_var(X)
        not_const = var > 0
        estimat_var = np.zeros(adata.shape[1], dtype=np.float64)

        y = np.log10(var[not_const])
        x = np.log10(mean[not_const])
        model = loess(x, y, span=span, degree=2)
        model.fit()
        estimat_var[not_const] = model.outputs.fitted_values
        reg_std = np.sqrt(10 ** estimat_var)

        batch_counts = X.astype(np.float64).copy()
        # clip large values as in Seurat
        N = np.sum(batch_info == b)
        vmax = np.sqrt(N)
        clip_val = reg_std * vmax + mean
        if sp_sparse.issparse(batch_counts):
            batch_counts = sp_sparse.csr_matrix(batch_counts)
            mask = batch_counts.data > clip_val[batch_counts.indices]
            batch_counts.data[mask] = clip_val[batch_counts.indices[mask]]
        else:
            clip_val_broad = np.broadcast_to(clip_val, batch_counts.shape)
            np.putmask(
                batch_counts,
                batch_counts > clip_val_broad,
                clip_val_broad,
            )

        if sp_sparse.issparse(batch_counts):
            squared_batch_counts_sum = np.array(
                batch_counts.power(2).sum(axis=0))
            batch_counts_sum = np.array(batch_counts.sum(axis=0))
        else:
            squared_batch_counts_sum = np.square(batch_counts).sum(axis=0)
            batch_counts_sum = batch_counts.sum(axis=0)

        norm_gene_var = (1 / ((N - 1) * np.square(reg_std))) * (
            (N * np.square(mean))
            + squared_batch_counts_sum
            - 2 * batch_counts_sum * mean
        )
        norm_gene_vars.append(norm_gene_var.reshape(1, -1))

    norm_gene_vars = np.concatenate(norm_gene_vars, axis=0)
    # argsort twice gives ranks, small rank means most variable
    ranked_norm_gene_vars = np.argsort(np.argsort(-norm_gene_vars, axis=1),
                                       axis=1)
    # this is done in SelectIntegrationFeatures() in Seurat v3
    ranked_norm_gene_vars = ranked_norm_gene_vars.astype(np.float32)
    num_batches_high_var = np.sum(
        (ranked_norm_gene_vars < n_top_genes).astype(int), axis=0)
    ranked_norm_gene_vars[ranked_norm_gene_vars >= n_top_genes] = np.nan
    ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)
    median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)

    df = pd.DataFrame(index=np.array(adata.var_names))
    df['highly_variable_nbatches'] = num_batches_high_var
    df['highly_variable_rank'] = median_ranked
    df['variances_norm'] = np.mean(norm_gene_vars, axis=0)
    df['means'] = mean
    df['variances'] = var

    df.sort_values(
        ['highly_variable_rank', 'highly_variable_nbatches'],
        ascending=[True, False],
        na_position='last',
        inplace=True,
    )
    df['highly_variable'] = False
    # select the top genes positionally; the index holds gene names, so an
    # integer slice must go through iloc rather than loc
    df.iloc[:int(n_top_genes), df.columns.get_loc('highly_variable')] = True
    df = df.loc[adata.var_names]

    if inplace or subset:
        adata.uns['hvg'] = {'flavor': 'seurat_v3'}
        logg.hint(
            'added\n'
            '    \'highly_variable\', boolean vector (adata.var)\n'
            '    \'highly_variable_rank\', float vector (adata.var)\n'
            '    \'means\', float vector (adata.var)\n'
            '    \'variances\', float vector (adata.var)\n'
            '    \'variances_norm\', float vector (adata.var)'
        )
        adata.var['highly_variable'] = df['highly_variable'].values
        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
        adata.var['means'] = df['means'].values
        adata.var['variances'] = df['variances'].values
        adata.var['variances_norm'] = df['variances_norm'].values.astype(
            'float64', copy=False)
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'].values
        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)
    else:
        if batch_key is None:
            df = df.drop(['highly_variable_nbatches'], axis=1)
        return df
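# --- Hedged usage note, not part of the original source: this private
# helper is normally reached through scanpy's public API, roughly as below
# (requires scanpy and scikit-misc; pbmc3k() downloads an example dataset).
import scanpy as sc

adata = sc.datasets.pbmc3k()
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=2000)
print(adata.var['highly_variable'].sum())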
def get_significant_ev(ev1, ev2, ids, cond1, cond2, norm='loess', plot='all',
                       mid_point=0., EV_range=(-1.4, 1.4), confidence=0.95,
                       alpha=0.75, kernel_density=200):
    """
    Compare two eigenvectors (from two conditions), using as null, or
    background, model the differences between each pair of neighbor bins.
    EigenVectors are Z-score normalized and their difference is LOESS
    normalized

    :param ev1: list of values in eigenvectors from first condition
    :param ev2: list of values in eigenvectors from second condition
    :param ids: list of names (e.g. [('chr1', 1294), ...]) corresponding to
       values in each eigenvector
    :param cond1: name of first experiment (corresponding to first
       eigenvector)
    :param cond2: name of second experiment (corresponding to second
       eigenvector)
    :param loess norm: normalization to perform on the difference of
       eigenvectors (options are loess or none)
    :param 0.75 alpha: smoothing parameter for LOESS normalization (0 pass
       through all points, 1, straight line)
    :param 200 kernel_density: density of the matrix for Gaussian kernel
       density estimation
    :param all plot: plots to be generated. If 'all', 6 plots will be
       generated: three for differences between conditions (before and after
       LOESS normalization, and density map), the same for the null model.
       If 'final', only a single plot with density maps of observed data and
       null model. If 'none', no plot will be generated.
    :param 0.95 confidence: confidence level for definition of bins with
       significantly different compartments

    :returns: a dictionary with, as keys, the input ids, and as values, a
       tuple with two values: 1- the probabilities (or Cumulative Densities)
       of each compared pair of eigenvector values inside the Gaussian kernel
       density of the null model, and 2- the LOESS normalized difference
       between eigenvector values
    """
    print('Getting EigenVectors')
    ev1 = np.array(ev1)
    ev2 = np.array(ev2)
    ids = np.array(ids)

    # ~normalize
    # ev1 = ev1 / np.std(ev1) / 3
    # ev2 = ev2 / np.std(ev2) / 3

    # plot
    axes = []
    if plot == 'all':
        _ = plt.figure(figsize=(18, 18))
    elif plot != 'none':
        _ = plt.figure(figsize=(10, 10))
        axe = plt.subplot(111)
    if plot in ['all', 'correlation']:
        if plot == 'all':
            axe = plt.subplot(2, 2, 1)
        axes.append(axe)
        axe = compare_AB(ev1, ev2, axe=axe, xlabel='Eigenvector of ' + cond1,
                         ylabel='Eigenvector of ' + cond2, color_ab=True,
                         mid_point=mid_point, EV_range=EV_range)
        axe.set_title('Correlation between EigenVectors (%s vs %s)' % (
            cond1, cond2))

    # Definition of null model
    print('Defining Null model')
    ev3 = []
    ev4 = []
    for pos in range(0, len(ev1) - 1, 2):
        # we want same chromosome and true neighbors
        if (ids[pos][0] == ids[pos + 1][0]
                and ids[pos + 1][1] - ids[pos][1] == 1):
            ev3.append(ev1[pos])
            ev3.append(ev2[pos])
            ev4.append(ev1[pos + 1])
            ev4.append(ev2[pos + 1])
    if plot == 'all':
        axes.append(plt.subplot(2, 2, 2))
        axe = compare_AB(
            ev3, ev4, axe=axes[-1],
            xlabel='Eigenvector from %s and %s ($n$)' % (cond1, cond2),
            ylabel='Eigenvector from %s and %s ($n+1$)' % (cond1, cond2),
            color_ab=True, mid_point=mid_point, EV_range=EV_range)
        axe.set_title(('Correlation of EigenVectors (Null model)\n'
                       '{0} bins $n$ vs $n+1$ and {1} '
                       'bins $n$ vs $n+1$').format(cond1, cond2))

    ##########################################################################
    # Normalization
    # Z-scorish
    zev1 = ev1  # (ev1 - np.mean(ev1)) / np.std(ev1) / 3
    zev2 = ev2  # (ev2 - np.mean(ev2)) / np.std(ev2) / 3

    # prepare data for MA plot
    x = (zev1 + zev2) / 2
    y = (zev1 - zev2)
    # sort
    idx = np.argsort(x)
    x = x[idx]
    y = y[idx]
    ids = ids[idx]

    # for null model:
    zev3 = np.array(ev3)
    zev4 = np.array(ev4)
    x_cor = (zev3 + zev4) / 2
    y_cor = (zev3 - zev4)
    idx_cor = np.argsort(x_cor)
    x_cor = x_cor[idx_cor]
    y_cor = y_cor[idx_cor]

    # LOESS
    if norm == 'loess':
        print('Computing LOESS on observed data')
        lo = loess(x, y, span=alpha, weights=None)
        lo.fit()
        pred = lo.outputs.fitted_values.copy()
        df = lo.outputs.enp
    else:
        pred = np.zeros(len(y))
        df = 0

    # LOESS on Null model
    if norm == 'loess':
        print('Computing LOESS on Null model')
        lo = loess(x_cor, y_cor, span=alpha, weights=None)
        lo.fit()
        pred_cor = lo.outputs.fitted_values.copy()
        df = lo.outputs.enp
    else:
        pred_cor = np.zeros(len(y_cor))
        df = 0

    ##########################################################################
    # ordinary least square regression
    print('Perform OLS regression and outlier test')
    # for real data
    modelR = OLS(y - pred, x)
    resultR = modelR.fit()
    # for null model
    modelN = OLS(y_cor - pred_cor, x_cor)
    resultN = modelN.fit()
    sigmaN = np.sqrt(resultN.mse_resid)

    inflR = resultR.get_influence
    hiiR = inflR().hat_matrix_diag  # model leverage
    sigmaR = np.sqrt(resultR.mse_resid)
    residR = resultR.resid / sigmaN / np.sqrt(1 - hiiR)
    dfR = modelR.df_resid - 1
    unadj_pR = st.t.sf(np.abs(residR), dfR) * 2
    adj_pR = multipletests(unadj_pR, alpha=0.05, method='bonferroni')[1]

    if plot in ['all', 'difference']:
        if plot == 'all':
            axe = plt.subplot(2, 2, 3)
        axe.set_title(('Bland-Altman plot of EigenVectors (%s vs %s)\n'
                       'with prediction bands based on null model') % (
                           cond1, cond2))
        axes.append(nice_ba_plot(x, y, unadj_pR, sigmaN, sigmaR, pred,
                                 cond1, cond2, alpha=alpha, ax=axe))

    ##########################################################################
    print('Perform the kernel density estimate for null model')
    y -= pred
    y_cor -= pred_cor
    # Perform the kernel density estimate for null model
    xmin = min(x_cor) - abs(min(x_cor)) * .5
    ymin = min(y_cor) - abs(min(y_cor)) * .5
    xmax = max(x_cor) * 1.5
    ymax = max(y_cor) * 1.5
    xx, yy = np.mgrid[xmin:xmax:complex(0, kernel_density),
                      ymin:ymax:complex(0, kernel_density)]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    z_cor = np.vstack([x_cor, y_cor])
    kernel_cor = st.gaussian_kde(z_cor)
    f_cor = np.reshape(kernel_cor(positions).T, xx.shape)
    f_cor_sum = f_cor.sum()
    f_cor /= f_cor_sum

    print('Perform the kernel density estimate for comparison')
    # Perform the kernel density estimate for comparison
    z = np.vstack([x, y])
    kernel = st.gaussian_kde(z)
    f = np.reshape(kernel(positions).T, xx.shape)
    f /= f.sum()

    # define probability lines
    n = 10000
    t = np.linspace(0, f_cor.max(), n)
    # kernel probability for null model
    integral = ((f_cor >= t[:, None, None]) * f_cor).sum(axis=(1, 2))
    # function to get kernel density at a given CDF
    ff = interpolate.interp1d(integral, t)
    steps = [0.99, 0.95, 0.9, 0.75, 0.50, 0.25]
    t_contours = ff(np.array(steps))
    # function to get CDF at a given kernel density
    invff = interpolate.interp1d(t, integral)

    # significant bins in Null model
    cut = confidence

    # significant bins observed data
    print('Computing significant changes in observed data')
    signx = []
    signy = []
    result = {}
    # get kernel density for each pair of eigenvectors
    pvals = kernel_cor.pdf((x, y)) / f_cor_sum
    ev1 = ev1[idx]
    ev2 = ev2[idx]
    for i, pv in enumerate(pvals):
        try:
            pv = invff(pv)  # convert PDF to CDF
        except ValueError:
            try:
                pv = 1.
            except ValueError:
                pv = 0.
                continue
        if pv > cut:
            signx.append(x[i])
            signy.append(y[i])
        result[ids[i][0], ids[i][1]] = (ev1[i], ev2[i], pv, y[i],
                                        unadj_pR[i], adj_pR[i])

    if plot in ['all', 'density']:
        if plot == 'all':
            axe = plt.subplot(2, 2, 4)
        plt.title(('LOESS normalized BA density plot of EigenVectors\n'
                   '({0} vs {1} plotted over null model)').format(
                       cond1, cond2))
        axes.append(nice_contour_plot(
            xx, yy, f, f_cor, cond1, cond2, ax=axe, total_len=len(x),
            cut=cut, signx=signx, signy=signy, t_contours=t_contours,
            steps=steps))

    if plot in ['all', 'correlation']:
        for axe in axes[:2]:
            axe.set_xlim(EV_range)
            axe.set_ylim(EV_range)
    if plot in ['all', 'density', 'difference']:
        xlim = (min(x.min(), x_cor.min()), max(x.max(), x_cor.max()))
        ylim = (min(y.min(), y_cor.min()) * 1.15,
                max(y.max(), y_cor.max()) * 1.15)
        for axe in (axes[2:] if plot == 'all' else axes):
            axe.set_xlim(xlim)
            axe.set_ylim(ylim)
    return result
    output = runByCaseSmooth(case, maf, genometot, data, span, IDs,
                             nathres, offby)
    return case, output


def runByCaseSmooth(case, maf, genometot, data, span, IDs, nathres=0.3,
                    offby=3):
    start_time = time.time()
    model = loess.loess(data['starts'], data['counts'], span=span,
                        surface='direct')
    model.fit()
    stored_all = {
        'mutdiff': [],
        'position': [],
        'mutrate': [],
        'mutrate_noadj': [],
        'patient': []
    }
    use_mean = True
    these = getMinDistByGenome(maf, case, IDs, offby=offby, use_mean=use_mean)
    if these.shape[0] == 0:
        logger.info(
def seqff(self):
    """
    Returns values of seqff, wrsc, enet score

    supplementary files can be downloaded below:
    https://obgyn.onlinelibrary.wiley.com/doi/abs/10.1002/pd.4615

    :param bininfoData: location of supplementary table2.csv file
    :type bininfoData: String
    :param inputData: directory path or file location of inputdata
        (".sam" or ".bam" or ".newtemp")
    :type inputData: String
    :param rdata: location of supplementary .rdata file
    :type rdata: String
    :param output_lod: where result files are (total 4 directories will be
        created)
    :type output_lod: String
    :return: None
    """
    start = time.time()

    # load bininfo
    bininfo = load_bininfo(self.bininfodata_loc)

    # load input files
    if os.path.isdir(self.input_loc):
        input_list = [self.input_loc + x for x in os.listdir(self.input_loc)]
    elif os.path.isfile(self.input_loc):
        input_list = [self.input_loc]
    else:
        raise FileNotFoundError(
            "error occurred : inputData is not a Directory or File")

    for i, file in enumerate(input_list):
        filetype = file.split(".")[-1]  # filetype : 'sam' or 'bam' or 'newtemp'
        if 'sam' in filetype:
            bincount = load_sam(file)
        elif 'newtemp' in filetype:
            bincount = load_counts(file)
            file = file.replace(".newtemp", "")  # TEMP .newtemp -> .bam
        elif 'bam' in filetype:
            bincount = load_bam(file)
        else:
            continue

        # CREATE newtemp file in "output_loc"/newtemp/
        create_newtemp(bincount, file, self.newtemp_loc)

        newtemp = pd.DataFrame.from_dict(bincount, orient='index')
        newtemp.reset_index(level=0, inplace=True)
        newtemp.rename(columns={'index': 'binName', 0: 'counts'},
                       inplace=True)

        temp_bininfo = bininfo.copy(deep=True)
        temp_bininfo = temp_bininfo.merge(
            newtemp, on='binName',
            how='left')  # missing value : NaN, not NA in pandas
        temp_bininfo['counts'] = temp_bininfo['counts'].fillna(0)
        temp_bininfo.sort_values(by='binorder', inplace=True)
        temp_bininfo.reset_index(drop=True)

        # DATA PROCESSING
        autosomebinsonly = []
        for index in range(61927):
            boolean = (temp_bininfo['FRS'][index] != 'NA') and \
                      (float(temp_bininfo['GC'][index]) > 0.316) and \
                      (temp_bininfo['CHR'][index] != 'chrX') and \
                      (temp_bininfo['CHR'][index] != 'chrY')
            autosomebinsonly.append(boolean)
        autosomebinsonly = pd.Series(autosomebinsonly)

        alluseablebins = []
        for index in range(61927):
            boolean = (temp_bininfo['FRS'][index] != "NA") and \
                      (float(temp_bininfo['GC'][index]) > 0.316)
            alluseablebins.append(boolean)
        alluseablebins = pd.Series(alluseablebins)

        # CREATE alluseablebins file in "output_loc"/alluseablebins
        # create_alluseablebins(alluseablebins, file, self.alluseablebins_loc)

        sum_counts = pd.Series(temp_bininfo['counts'])
        sum_counts = sum_counts[autosomebinsonly].sum(skipna=True)

        autoscaledtemp = pd.Series(
            temp_bininfo['counts'].loc[(autosomebinsonly)],
            copy=True) / sum_counts  # NA-related code removed
        allscaledtemp = pd.Series(
            temp_bininfo['counts'].loc[(alluseablebins)],
            copy=True) / sum_counts

        gc_index = {}
        cnt = 0
        for index, isauto in enumerate(autosomebinsonly):
            if isauto:
                if temp_bininfo['GC'].iat[index] in gc_index:
                    gc_index[temp_bininfo['GC'].iat[index]].append(
                        float(autoscaledtemp.iat[cnt]))
                    cnt += 1
                else:
                    gc_index[temp_bininfo['GC'].iat[index]] = [
                        float(autoscaledtemp.iat[cnt])
                    ]
                    cnt += 1

        key_list = []
        val_list = []
        for key, val in gc_index.items():
            key_list.append(key)
            val_list.append(np.median(val))

        loess_var = loess(key_list, val_list)  # default span : 0.75
        loess_var.fit()
        # y = loess.loess_prediction(newData, loessVar)
        # temp_loessPredict.loess_debugging(loessVar)

        # prediction
        loess_x = [
            float(gc) for index, gc in enumerate(temp_bininfo['GC'])
            if alluseablebins[index]
        ]
        loess_fitted = loess_var.predict(loess_x)
        loess_fitted = list(loess_fitted.values)

        median_autoscaledtemp = np.median(autoscaledtemp)
        median_autoscaledtemp = float(median_autoscaledtemp)  # fixed constant

        normalizedbincount = [
            (x + (median_autoscaledtemp - loess_fitted[index]))
            for index, x in enumerate(allscaledtemp)
        ]

        # CREATE normalizedbincount in "output_loc"/normalizedbincount
        create_normalizedbincount(normalizedbincount, file,
                                  self.normalizedbincount_loc)

        bincounts = pd.Series(data=np.repeat(a=0.0, repeats=61927),
                              index=temp_bininfo['binName'],
                              dtype=np.float64)
        sum_normalizedbincount = sum(
            [val for val in normalizedbincount if not math.isnan(val)])
        sum_normalizedbincount = float(
            sum_normalizedbincount)  # deep copy temporarily

        cnt = 0
        for index, x in enumerate(alluseablebins):
            if x:
                data = (normalizedbincount[cnt] /
                        sum_normalizedbincount) * len(normalizedbincount)
                bincounts.iat[index] = data
                cnt += 1

        # CREATE bincounts in "output_loc"/bincounts
        create_bincounts(bincounts, file, self.bincounts_loc)

        wrsc = self.prediction(bincounts, self.B, self.mu,
                               self.parameter_1, self.parameter_2)
        enet = np.dot(bincounts, self.elnetbeta) + self.elnetintercept
        ff = (wrsc + enet) / 2

        result_lines = list()
        result_lines.append("SeqFF\tEnet\tWRSC")
        result_lines.append("{}\t{}\t{}".format(ff, enet, wrsc))

        # CREATE results of seqff (seqff paper result covered) in "output_loc"/results
        create_results(result_lines, file, self.results_loc)

        end = time.time()
        elapsed = end - start
        h = int(elapsed) // 3600
        m = (int(elapsed) - (h * 3600)) // 60
        s = int(elapsed) % 60
        print("elapsed time: %d hr %d min %d sec" % (h, m, s))
        print("elapsed :", elapsed)
        print("progress : {} / {}".format(i + 1, self.progress))