def parse_limma_result(r_fit): ans = {} # Convert to pandas for col in [ 'coefficients', 'cov.coefficients', 'stdev.unscaled', 't', 'p.value', 'lods' ]: ans[col] = to_dataframe(r_dollar(r_fit, col)) fit_df = {} ans['var.prior'] = pd.Series(numpy2ri.rpy2py(r_dollar(r_fit, 'var.prior')), index=ans['coefficients'].columns) # Additionally convert numpy arrays to pandas series for col in [ 'df.prior', 'df.residual', 'sigma', 'Amean', 'df.total', 'F', 'F.p.value', 's2.post' ]: np_array = numpy2ri.rpy2py(r_dollar(r_fit, col)) fit_df[col] = pd.Series(np_array, index=ans['coefficients'].index) # These ones need some extra nudge for col in ['rank', 'method', 's2.prior', 'proportion']: np_array = numpy2ri.rpy2py(r_dollar(r_fit, col)) # These are only one number for whole dataset assert len(np_array) == 1 fit_df[col] = pd.Series(np_array[0], index=ans['coefficients'].index) fit_df = pd.DataFrame(fit_df) ans['fit'] = fit_df return ans
def _rpy2py(X):
    """Convert an rpy2 object to Python, preferring the numpy converter.

    Falls back to the generic rpy2 conversion for objects the numpy
    converter does not support.
    """
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri

    try:
        converted = numpy2ri.rpy2py(X)
    except NotImplementedError:
        # numpy2ri refuses some R types; use the generic converter instead.
        converted = ro.conversion.rpy2py(X)
    return converted
def rpy2py_single_cell_experiment(obj: SexpS4) -> AnnData:
    """Convert an R SingleCellExperiment object into an AnnData object.

    The first assay becomes ``X``, remaining assays become ``layers``,
    reduced dimensions become ``obsm``, colData/rowData become
    ``obs``/``var``, and the S4 metadata becomes ``uns``.
    """
    with localconverter(default_converter):
        s4v = importr("S4Vectors")
        se = importr("SummarizedExperiment")
        sce = importr("SingleCellExperiment")

        assay_names = se.assayNames(obj)
        if not isinstance(assay_names, NULLType):
            assay_names = [str(a) for a in se.assayNames(obj)]
            # The assays can be stored in an env or elsewise, so we don't
            # use obj.slots['assays']; the accessor handles all layouts.
            # Transposed because R stores genes x cells, AnnData wants
            # cells x genes.
            assays = [
                numpy2ri.rpy2py(assay).T
                for assay in (se.assay(obj, n) for n in assay_names)
            ]
            # First assay is the main matrix; the rest become layers.
            exprs, layers = assays[0], dict(zip(assay_names[1:], assays[1:]))
            assert len(exprs.shape) == 2, exprs.shape
        else:
            # There are SingleCellExperiments with no assays at all.
            exprs, layers = None, {}

        rdim_names = sce.reducedDimNames(obj)
        if not isinstance(rdim_names, NULLType):
            rdim_names = [str(t) for t in rdim_names]
            reduced_dims = [
                numpy2ri.rpy2py(rd)
                for rd in (sce.reducedDim(obj, t) for t in rdim_names)
            ]
            # Rename reducedDim keys to scanpy's naming convention.
            obsm = {
                conv_name.sce2scanpy(n): d
                for n, d in zip(rdim_names, reduced_dims)
            }
        else:
            obsm = None

        col_data = se.colData(obj)
        row_data = se.rowData(obj)
        metadata = s4v.metadata(obj)

        obs = rpy2py_data_frame(col_data)
        var = rpy2py_data_frame(row_data)

    # The whole shebang: configured converter, numpy, pandas and ours —
    # metadata values can be arbitrary R objects, so the full converter
    # stack is needed here.
    with localconverter(full_converter()):
        uns = dict(metadata.items())

    return AnnData(exprs, obs, var, uns, obsm, layers=layers)
def rpy2py_intvector(obj):
    """Convert an R integer vector to Python.

    Factors are converted to a ``pandas.Categorical``; plain integer
    vectors are delegated to the numpy converter.
    """
    # special case for factors
    if 'factor' in obj.rclass:
        # R factor codes are 1-based; NA_integer_ surfaces as a
        # non-positive value once converted, so map anything <= 0 to -1,
        # the sentinel pandas.Categorical uses for missing values.
        # (Subtracting 1 unconditionally would turn NA into a bogus code.)
        codes = [x - 1 if x > 0 else -1 for x in numpy.asarray(obj)]
        res = pandas.Categorical.from_codes(
            codes,
            categories=list(obj.do_slot('levels')),
            ordered='ordered' in obj.rclass)
    else:
        res = numpy2ri.rpy2py(obj)
    return res
def rpy2py_intvector(obj):
    """Convert an R integer vector; factors become pandas Categoricals."""
    if 'factor' not in obj.rclass:
        return numpy2ri.rpy2py(obj)
    # R factor codes are 1-based and NA appears as a non-positive value
    # after conversion; pandas encodes missing categories as -1.
    raw_codes = numpy.array(obj)
    zero_based = [code - 1 if code > 0 else -1 for code in raw_codes]
    levels = list(obj.do_slot('levels'))
    return pandas.Categorical.from_codes(zero_based,
                                         categories=levels,
                                         ordered='ordered' in obj.rclass)
def test_cpm_normalization():
    """Check cpm_normalize against edgeR's reference cpm implementation."""
    from rpy2.robjects import numpy2ri
    from rpy2.robjects.packages import importr

    counts = np.array([
        [5, 4, 3],
        [2, 1, 4],
        [3, 4, 6],
        [4, 2, 8],
    ])
    edgeR = importr('edgeR')
    # Round-trip through R: convert counts, run edgeR's cpm, convert back.
    reference = numpy2ri.rpy2py(edgeR.cpm(numpy2ri.py2rpy(counts)))
    assert np.allclose(cpm_normalize(counts), reference, atol=1e-2)
def rpy2py_floatvector(obj):
    """Convert an R double vector to Python.

    POSIXct vectors (seconds since the epoch) become a pandas
    DatetimeIndex localized to the vector's timezone; anything else is
    delegated to the numpy converter.
    """
    import warnings

    # special case for POSIXct date objects
    if 'POSIXct' in obj.rclass:
        try:
            tzone_name = obj.do_slot('tzone')[0]
        except LookupError:
            # Some POSIXct objects carry no "tzone" attribute at all;
            # do_slot then raises LookupError. Treat this like an empty
            # tzone (i.e. fall back to the local timezone) instead of
            # crashing the conversion.
            warnings.warn('R object inheriting from "POSIXct" but without '
                          'attribute "tzone".')
            tzone_name = ''
        if tzone_name == '':
            # R is implicitly using the local timezone, while Python time
            # libraries will assume UTC.
            tzone = get_timezone()
        else:
            tzone = pytz.timezone(tzone_name)
        foo = (tzone.localize(datetime.fromtimestamp(x)) for x in obj)
        res = pandas.to_datetime(tuple(foo))
    else:
        res = numpy2ri.rpy2py(obj)
    return res
def rpy2py_floatvector(obj):
    """Convert an R double vector; POSIXct vectors become datetimes."""
    # Non-datetime doubles go straight through the numpy converter.
    if 'POSIXct' not in obj.rclass:
        return numpy2ri.rpy2py(obj)

    # POSIXct stores seconds since the epoch plus an optional "tzone"
    # attribute; a missing attribute raises LookupError.
    try:
        tzone_name = obj.do_slot('tzone')[0]
    except LookupError:
        warnings.warn('R object inheriting from "POSIXct" but without '
                      'attribute "tzone".')
        tzone_name = ''

    if tzone_name == '':
        # R is implicitly using the local timezone, while Python
        # time libraries will assume UTC.
        tzone = get_timezone()
    else:
        tzone = pytz.timezone(tzone_name)

    localized = tuple(
        tzone.localize(datetime.fromtimestamp(seconds)) for seconds in obj)
    return pandas.to_datetime(localized)
def ri2py_vector(obj):
    """Convert an R vector via the numpy2ri converter."""
    return numpy2ri.rpy2py(obj)
def test_atomic_vector_to_numpy(self):
    """An R IntVector should convert to a numpy array with equal values."""
    v = robjects.vectors.IntVector((1, 2, 3))
    a = rpyn.rpy2py(v)
    assert isinstance(a, numpy.ndarray)
    # Assert on the converted array's contents; the original code checked
    # v[0] (the R input), which says nothing about the conversion result.
    assert list(a) == [1, 2, 3]
def rpy2py_floatvector(obj):
    """Convert an R double vector, dispatching POSIXct datetimes specially."""
    if POSIXct.isrinstance(obj):
        # Wrap in the POSIXct view and recurse into the generic converter.
        return rpy2py(POSIXct(obj))
    return numpy2ri.rpy2py(obj)
def rpy2py_floatvector(obj):
    """Convert an R double vector to a numpy array."""
    converted = numpy2ri.rpy2py(obj)
    return converted
def on_click(self, event):
    """Matplotlib click handler: decompose and plot the clicked pixel.

    Selects the time series nearest the click, runs a seasonal
    decomposition, plots observed/trend/seasonal/residual components and
    a climatology, and marks change points detected via R's changepoint
    package on the trend panel.
    """
    # Event does not apply for the time series plots themselves —
    # only clicks inside the two map panels are handled.
    if event.inaxes not in [self.left_p, self.right_p]:
        return
    # Clear subplots before redrawing for the new pixel.
    self.observed.clear()
    self.trend.clear()
    self.seasonal.clear()
    self.resid.clear()
    self.climatology.clear()
    # Delete last reference point (the marker from the previous click).
    if len(self.left_p.lines) > 0:
        del self.left_p.lines[0]
        del self.right_p.lines[0]
    # Draw a point as a reference on both map panels.
    self.left_p.plot(event.xdata, event.ydata,
                     marker='o', color='red', markersize=7, alpha=0.7)
    self.right_p.plot(event.xdata, event.ydata,
                      marker='o', color='red', markersize=7, alpha=0.7)
    # Non-masked data: pick the time series nearest the clicked coords.
    left_plot_sd = self.left_ds.sel(longitude=event.xdata,
                                    latitude=event.ydata,
                                    method='nearest')
    # Single year dataset at the same location.
    single_year_ds = self.single_year_ds.sel(longitude=event.xdata,
                                             latitude=event.ydata,
                                             method='nearest')
    # Materialize lazily-chunked (dask-backed) arrays before plotting.
    if left_plot_sd.chunks is not None:
        left_plot_sd = left_plot_sd.compute()
        single_year_ds = single_year_ds.compute()
    # Seasonal decompose; freq is one year's worth of observations.
    ts_df = left_plot_sd.to_dataframe()
    self.seasonal_decompose = seasonal_decompose(
        ts_df[self.data_vars.value],
        model=self.model.value,
        freq=self.single_year_ds.shape[0],
        extrapolate_trend='freq')
    # Plot the observed component of the decomposition.
    self.observed.plot(self.seasonal_decompose.observed.index,
                       self.seasonal_decompose.observed.values,
                       label='Observed')
    # Mann-Kendall trend test; result is embedded in the trend label.
    _mk_test = mk_test(self.seasonal_decompose.trend)
    self.trend.plot(self.seasonal_decompose.trend.index,
                    self.seasonal_decompose.trend.values,
                    label=f'Trend {_mk_test}')
    # Set the same y limits from observed data so panels are comparable.
    self.trend.set_ylim(self.observed.get_ylim())
    self.seasonal.plot(self.seasonal_decompose.seasonal.index,
                       self.seasonal_decompose.seasonal.values,
                       label='Seasonality')
    self.resid.plot(self.seasonal_decompose.resid.index,
                    self.seasonal_decompose.resid.values,
                    label='Residuals')
    # Climatology: distribution of values per day-of-year.
    sbn.boxplot(ts_df.index.dayofyear, ts_df[self.data_vars.value],
                ax=self.climatology)
    # Plot the year to analyse on top of the climatology boxplots.
    single_year_df = single_year_ds.to_dataframe()
    sbn.stripplot(single_year_df.index.dayofyear,
                  single_year_df[self.data_vars.value],
                  color='red', marker='o', size=7, alpha=0.7,
                  ax=self.climatology)
    self.climatology.tick_params(axis='x', rotation=70)
    # Change point detection on the trend component via R's changepoint
    # package (kept alternatives below for reference).
    r_vector = FloatVector(self.seasonal_decompose.trend.values)
    #changepoint_r = self.cpt.cpt_mean(r_vector)
    #changepoints_r = self.cpt.cpt_var(r_vector, method='PELT',
    #                                  penalty='Manual', pen_value='2*log(n)')
    changepoints_r = self.cpt.cpt_meanvar(r_vector, test_stat='Normal',
                                          method='BinSeg', penalty="SIC")
    changepoints = numpy2ri.rpy2py(self.cpt.cpts(changepoints_r))
    if changepoints.shape[0] > 0:
        # Plot a vertical line where each changepoint was found; R indices
        # are 1-based, hence the +1 shift into the trend's index.
        for i, i_cpt in enumerate(changepoints):
            i_cpt = int(i_cpt) + 1
            cpt_index = self.seasonal_decompose.trend.index[i_cpt]
            if i == 0 :
                # Label only the first line so the legend has one entry.
                self.trend.axvline(cpt_index, color='black', lw='1.0',
                                   label='Change point')
            else:
                self.trend.axvline(cpt_index, color='black', lw='1.0')
    # Legend
    self.observed.legend(loc='best', fontsize='small',
                         fancybox=True, framealpha=0.5)
    self.observed.set_title('Time series decomposition')
    self.trend.legend(loc='best', fontsize='small',
                      fancybox=True, framealpha=0.5)
    self.seasonal.legend(loc='best', fontsize='small',
                         fancybox=True, framealpha=0.5)
    self.resid.legend(loc='best', fontsize='small',
                      fancybox=True, framealpha=0.5)
    #self.climatology.legend([self.years.value], loc='best',
    self.climatology.legend(loc='best', fontsize='small',
                            fancybox=True, framealpha=0.5)
    self.climatology.set_title('Climatology')
    # Grid
    self.observed.grid(axis='both', alpha=.3)
    self.trend.grid(axis='both', alpha=.3)
    self.seasonal.grid(axis='both', alpha=.3)
    self.resid.grid(axis='both', alpha=.3)
    self.climatology.grid(axis='both', alpha=.3)
    # Redraw plot
    plt.draw()
def glove_main():
    """Train domain-specific GloVe embeddings in R, align them with the
    domain-general Stanford Common Crawl embeddings via CCA, and save the
    resulting domain-adapted embeddings to disk.
    """
    # Load data from nlp parsing
    with open('{}/articles-with-equations.json'.format(settings.data_dir),
              'r', encoding='utf-8') as jf:
        src_data = json.load(jf)
    texts = [
        src_data[art]['text'] for art in src_data
        if src_data[art]['text'] is not None
    ]
    # The "unidecode" step simplifies non-ASCII chars which
    # mess up the R GloVe engine.
    texts_df = pd.Series(texts).apply(lambda x: unidecode(x))
    texts_df = pd.DataFrame({'text': texts_df})

    # Source all the functions contained in the 'trainEmbeddings' R file.
    # FIX: .format() must be applied to the Python string, not written
    # inside the R source() literal (the original sent the raw "{}" text
    # to R, so the script was never sourced).
    r("source('{}/trainEmbeddings.R')".format('src/data'))
    # Call the main GloVe-embedding function from the R script
    trainEmbeddings_R = r("trainEmbeddings")
    # Train domain-specific GloVe embedding model and output as a
    # numpy matrix (rows = vocabulary, cols = embedding dims).
    pandas2ri.activate()
    DS_embeddings_R = trainEmbeddings_R(texts_df)
    pandas2ri.deactivate()
    DS_embeddings = numpy2ri.rpy2py(DS_embeddings_R[0])
    # Get domain-specific GloVe vocabulary
    domain_spec_vocab = list(DS_embeddings_R[1])

    # Load in Stanford's 'Common Crawl' domain-general GloVe embedding
    # model, pulling out only the words contained in our corpus.
    # * This can take a while (~30min) - could use some optimization *
    DG_embeddings = loadGloveModel(
        '{}/glove.42B.300d.txt'.format(settings.data_dir),
        domain_spec_vocab)

    # Processing to ensure rows match between the domain-general and
    # domain-specific embeddings.
    # FIX: the vocabulary is the dict's KEYS; the original built an array
    # of embedding vectors and then called .index()/set() on it, which
    # cannot work. Keep a parallel word list and vector matrix instead.
    domain_gen_vocab = list(DG_embeddings.keys())
    DG_matrix = np.array([DG_embeddings[word] for word in domain_gen_vocab])
    # Find the indices of matching words; sort for a deterministic row
    # order (set iteration order is not reproducible across runs).
    both = sorted(set(domain_gen_vocab).intersection(domain_spec_vocab))
    indices_gen = [domain_gen_vocab.index(x) for x in both]
    indices_spec = [domain_spec_vocab.index(x) for x in both]
    indices_spec_notDG = [
        domain_spec_vocab.index(x) for x in domain_spec_vocab
        if x not in both
    ]
    # Sort and subset both arrays so their rows are word-aligned.
    DS_embeddings_subset = DS_embeddings[indices_spec, :].copy()
    DG_embeddings_subset = DG_matrix[indices_gen, :].copy()

    # Fit CCA model on the aligned vocabulary.
    cca_res, DA_embeddings = domain_adapted_CCA(DG_embeddings_subset,
                                                DS_embeddings_subset,
                                                NC=100)
    # Project the domain-specific words missing from the general model
    # into the shared CCA space.
    DS_embeddings_notinDG = DS_embeddings[indices_spec_notDG, :]
    DS_embeddings_notinDG_norm = zscore(DS_embeddings_notinDG)
    DA_notinDG_embeddings = (cca_res.y_weights_.T
                             @ DS_embeddings_notinDG_norm.T)
    DA_embeddings_final = np.append(DA_embeddings,
                                    DA_notinDG_embeddings.T, axis=0)
    # Write data to disk.
    # FIX: embeddings are floats; fmt='%d' truncated every value to an
    # integer, destroying the embedding.
    np.savetxt('{}/da_embeddings.txt'.format(settings.models_dir),
               DA_embeddings_final, fmt='%.6f')
def on_click(self, event):
    """Click handler: decompose and plot the time series at the clicked pixel.

    Selects the series nearest the click, runs a moving-window seasonal
    decomposition, plots observed (with peaks/valleys), trend, seasonal
    and residual components plus a climatology, and marks change points
    found on the trend via R's changepoint package.
    """
    # Event does not apply for time series plot; only map-panel clicks.
    if event.inaxes not in [self.left_p, self.right_p]:
        return
    # Clear subplots for the new pixel.
    self.observed.clear()
    self.trend_p.clear()
    self.seasonal_p.clear()
    self.resid_p.clear()
    self.climatology.clear()
    # Delete last reference point (marker from the previous click).
    if len(self.left_p.lines) > 0:
        del self.left_p.lines[0]
        del self.right_p.lines[0]
    # Draw a point as a reference on both map panels.
    self.left_p.plot(event.xdata, event.ydata,
                     marker='o', color='red', markersize=7, alpha=0.7)
    self.right_p.plot(event.xdata, event.ydata,
                      marker='o', color='red', markersize=7, alpha=0.7)
    # Non-masked data nearest the clicked coordinates.
    left_plot_sd = self.left_ds.sel(longitude=event.xdata,
                                    latitude=event.ydata,
                                    method='nearest')
    # Single year dataset at the same location.
    single_year_ds = self.single_year_ds.sel(longitude=event.xdata,
                                             latitude=event.ydata,
                                             method='nearest')
    # Materialize dask-backed selections before plotting.
    if left_plot_sd.chunks is not None:
        left_plot_sd = left_plot_sd.compute()
        single_year_ds = single_year_ds.compute()
    ts_df = left_plot_sd.to_dataframe()
    # Mann-Kendall test; result goes into the observed label.
    _mk_test = mk_test(left_plot_sd.data, _round=3)
    # Observations + peaks and valleys
    self.observed.plot(left_plot_sd.time, left_plot_sd.data,
                       label=f'Observed {_mk_test}')
    # Peaks must be at least a quarter of a year apart.
    distance = int(np.ceil(len(self.single_year_ds.time) / 4))
    peaks, _ = find_peaks(left_plot_sd.data, distance=distance)
    self.observed.plot(left_plot_sd.time[peaks], left_plot_sd[peaks],
                       label=f'Peaks [{peaks.shape[0]}]', marker='x',
                       color='C1', alpha=0.3)
    valleys, _ = find_peaks(left_plot_sd.data*(-1), distance=distance)
    self.observed.plot(left_plot_sd.time[valleys], left_plot_sd[valleys],
                       label=f'Valleys, [{valleys.shape[0]}]', marker='x',
                       color='C2', alpha=0.3)
    # Seasonal decompose
    period = int(self.bandwidth.currentText())
    nobs = len(left_plot_sd)
    # TODO interpolate and extrapolate trend
    trend = left_plot_sd.rolling(time=period, min_periods=1,
                                 center=True).mean()
    self.trend_p.plot(left_plot_sd.time.data, trend.data,
                      label=f'Trend (window = {period})')
    # Mean value per day-of-year forms the seasonal template.
    period_averages = left_plot_sd.groupby("time.dayofyear").mean()
    if self.model.currentText()[0] == 'a':
        # Additive model: observed = trend + seasonal + resid.
        period_averages -= period_averages.mean(axis=0)
        seasonal = np.tile(period_averages.T, nobs // period + 1).T[:nobs]
        resid = (left_plot_sd - trend) - seasonal
    else:
        # Multiplicative model: observed = trend * seasonal * resid.
        period_averages /= period_averages.mean(axis=0)
        seasonal = np.tile(period_averages.T, nobs // period + 1).T[:nobs]
        resid = left_plot_sd / seasonal / trend
    self.seasonal_p.plot(left_plot_sd.time.data, seasonal,
                         label='Seasonality')
    self.resid_p.plot(left_plot_sd.time.data, resid.data,
                      label='Residuals')
    # Set the same y limits from observed data.
    self.trend_p.set_ylim(self.observed.get_ylim())
    # Climatology: per-day-of-year distribution.
    sbn.boxplot(ts_df.index.dayofyear, ts_df[self.data_vars.currentText()],
                ax=self.climatology)
    # Plot year to analyse on top of the climatology.
    single_year_df = single_year_ds.to_dataframe()
    sbn.stripplot(x=single_year_df.index.dayofyear,
                  y=single_year_df[self.data_vars.currentText()],
                  color='red', marker='o', size=7, alpha=0.7,
                  ax=self.climatology)
    self.climatology.tick_params(axis='x', rotation=70)
    # Change point detection on the locally-computed trend.
    r_vector = FloatVector(trend.data)
    changepoints_r = self.cpt.cpt_meanvar(r_vector, test_stat='Normal',
                                          method='BinSeg', penalty="SIC")
    changepoints = numpy2ri.rpy2py(self.cpt.cpts(changepoints_r))
    if changepoints.shape[0] > 0:
        # Plot a vertical line at each changepoint; R indices are
        # 1-based, hence the +1 shift.
        for i, i_cpt in enumerate(changepoints):
            i_cpt = int(i_cpt) + 1
            # FIX: this version computes `trend` locally and never sets
            # self.seasonal_decompose, so indexing
            # self.seasonal_decompose.trend.index here raised
            # AttributeError (or used stale data). Index the local
            # trend's time coordinate instead.
            cpt_index = trend.time.data[i_cpt]
            if i == 0 :
                # Label only the first line so the legend has one entry.
                self.trend_p.axvline(cpt_index, color='black', lw='1.0',
                                     label='Change point')
            else:
                self.trend_p.axvline(cpt_index, color='black', lw='1.0')
    # Legend
    self.observed.legend(loc='best', fontsize='small',
                         fancybox=True, framealpha=0.5)
    self.observed.set_title('Time series decomposition')
    self.trend_p.legend(loc='best', fontsize='small',
                        fancybox=True, framealpha=0.5)
    self.seasonal_p.legend(loc='best', fontsize='small',
                           fancybox=True, framealpha=0.5)
    self.resid_p.legend(loc='best', fontsize='small',
                        fancybox=True, framealpha=0.5)
    self.climatology.legend(loc='best', fontsize='small',
                            fancybox=True, framealpha=0.5)
    self.climatology.set_title('Climatology')
    # Grid
    self.observed.grid(axis='both', alpha=.3)
    self.trend_p.grid(axis='both', alpha=.3)
    self.seasonal_p.grid(axis='both', alpha=.3)
    self.resid_p.grid(axis='both', alpha=.3)
    self.climatology.grid(axis='both', alpha=.3)
    # Redraw plot
    plt.draw()
def on_pbCPD_click(self): """ Compute change points on the detrended time series """ # Wait cursor QtWidgets.QApplication.setOverrideCursor(Qt.WaitCursor) msg = f"Identifying change points..." self.progressBar.setEnabled(True) self.progressBar.setFormat(msg) self.progressBar.setValue(1) # Compute first the trend period = int(self.bandwidth.currentText()) nobs = len(self.left_ds) # Get data type dtype = self.left_ds.dtype # Get trend based on a moving window trend = self.left_ds.rolling(time=period, min_periods=1, center=True).mean().astype(dtype) trend = trend.compute() trend.attrs = self.left_ds.attrs # Output data output = xr.zeros_like(trend).astype(np.int16).load() output.attrs = self.left_ds.attrs layers, rows, cols = trend.shape for x in range(cols): self.progressBar.setValue(int((x / cols) * 100)) for y in range(rows): _data = trend[:,y,x] r_vector = FloatVector(_data) #changepoint_r = self.cpt.cpt_mean(r_vector) #changepoints_r = self.cpt.cpt_var(r_vector, method='PELT', # penalty='Manual', pen_value='2*log(n)') # CPD methods _method = 'BinSeg' _penalty = 'SIC' changepoints_r = self.cpt.cpt_meanvar(r_vector, test_stat='Normal', method=_method, penalty=_penalty) changepoints = numpy2ri.rpy2py( self.cpt.cpts(changepoints_r)).astype(int) if changepoints.shape[0] > 0: output[changepoints+1, y, x] = True fname = (f'{os.path.splitext(self.fname)[0]}' f'_change_points.tif') msg = f"Saving change points..." self.progressBar.setFormat(msg) self.progressBar.setValue(1) save_dask_array(fname=fname, data=output, data_var=self.data_vars.currentText(), method=None, n_workers=4, progressBar=self.progressBar) self.progressBar.setValue(0) self.progressBar.setEnabled(False) # Standard cursor QtWidgets.QApplication.restoreOverrideCursor()
def rpy2py_listvector(obj):
    """Convert an R list; data.frames become pandas DataFrames."""
    if 'data.frame' in obj.rclass:
        # Route data.frames through the DataFrame wrapper conversion.
        return rpy2py(DataFrame(obj))
    return numpy2ri.rpy2py(obj)
def to_series(r_vector, name=None):
    """Convert a named R vector into a pandas Series keyed by its names.

    Parameters
    ----------
    r_vector : rpy2 vector
        A named R vector; its ``names`` attribute supplies the index.
    name : str, optional
        Name for the resulting Series.
    """
    labels = numpy2ri.rpy2py(r_vector.names)
    data = numpy2ri.rpy2py(r_vector)
    return pd.Series(data, index=labels, name=name)