Exemplo n.º 1
0
def get_IRIS_package_versions(logger):
    """
    Build a table of version information for the IRIS R packages used in ISPAQ.

    For seismicRoll, IRISSeismic and IRISMustangMetrics the table holds the
    locally installed version, the version reported by R's
    ``available.packages()`` (column ``CRAN``) and a flag that is True when
    the package is listed by R's ``old.packages()``.

    :param logger: accepted for interface compatibility; not used here
    :return: pandas dataframe with columns package, installed, CRAN, upgrade
    """
    IRIS_packages = ['seismicRoll', 'IRISSeismic', 'IRISMustangMetrics']

    def _r_to_list(r_expression):
        # Evaluate an R expression and hand the converted result back as a list
        return pandas2ri.ri2py(robjects.r(r_expression)).tolist()

    # Installed versions first, then the versions R can currently offer
    installed_versions = _r_to_list(
        "installed.packages()[c('seismicRoll','IRISSeismic','IRISMustangMetrics'),'Version']")
    cran_versions = _r_to_list(
        "available.packages()[c('seismicRoll','IRISSeismic','IRISMustangMetrics'),'Version']")

    # Names of every installed package R considers out of date
    outdated = _r_to_list("old.packages()[,'Package']")

    # One flag per IRIS package, in the same order as IRIS_packages
    needs_upgrade = [name in outdated for name in IRIS_packages]

    table = pd.DataFrame({'package': IRIS_packages,
                          'installed': installed_versions,
                          'CRAN': cran_versions,
                          'upgrade': needs_upgrade})

    # Impose an explicit column order instead of relying on the default
    return table[['package', 'installed', 'CRAN', 'upgrade']]
Exemplo n.º 2
0
def load_data() -> pd.DataFrame:
    """Load the R dataset ``chredlin`` as a pandas dataframe.

    The returned frame is indexed by the R row names and carries an extra
    ``log_income`` column holding the natural logarithm of ``income``.
    """
    # Imported for its side effect; chredlin is presumably shipped with faraway
    importr('faraway')
    r.data('chredlin')

    frame = pandas2ri.ri2py(r.chredlin)
    row_names = pandas2ri.ri2py(r.chredlin.rownames)
    frame = frame.set_index(row_names)

    frame['log_income'] = np.log(frame['income'])
    return frame
Exemplo n.º 3
0
def apply_PSD_metric(r_stream, *args, **kwargs):
    """
    Invoke IRISMustangMetrics::PSDMetric and convert its results to pandas.

    :param r_stream: an r_stream object
    :param args: extra positional arguments forwarded to the R function
    :param kwargs: extra keyword arguments forwarded to the R function
    :return: tuple ``(df, correctedPSD, PDF)`` built from elements 0, 2 and 3
        of the R result; element 1 is not processed
    """
    psd_metric = robjects.r('IRISMustangMetrics::PSDMetric')
    # args and kwargs shouldn't be needed in theory
    r_results = psd_metric(r_stream, *args, **kwargs)

    # Element 0: metric list, flattened to an R dataframe before conversion
    metrics_df = pandas2ri.ri2py(_R_metricList2DF(r_results[0]))

    # Convert columns from R POSIXct to python UTCDateTime
    metrics_df.starttime = metrics_df.starttime.apply(UTCDateTime)
    metrics_df.endtime = metrics_df.endtime.apply(UTCDateTime)

    # TODO:  What to do about the list of spectraMetrics (element 1)?
    # TODO:  We would need a new R_spectrumMetricList2DF function to process this further.

    # Element 2: correctedPSD is returned as a dataframe
    corrected_psd = pandas2ri.ri2py(r_results[2])
    # Convert columns from R POSIXct to python UTCDateTime
    corrected_psd.starttime = corrected_psd.starttime.apply(UTCDateTime)
    corrected_psd.endtime = corrected_psd.endtime.apply(UTCDateTime)

    # Element 3: converted as-is
    pdf = pandas2ri.ri2py(r_results[3])

    return (metrics_df, corrected_psd, pdf)
Exemplo n.º 4
0
 def convert_fit_to_python(self, fit):
     """Return the coefficients of an R model fit as a pandas Series.

     :param fit: R object exposing a ``coefficients`` element through ``rx2``
     :return: pd.Series of coefficient values indexed by coefficient name
     """
     coeffs_r = fit.rx2('coefficients')
     coeffs = pandas2ri.ri2py(coeffs_r)
     coeff_names = pandas2ri.ri2py(coeffs_r.names).tolist()
     # The fitted values used to be converted here and then thrown away;
     # that dead conversion is dropped because only the coefficients are returned.
     return pd.Series(dict(zip(coeff_names, coeffs)))
Exemplo n.º 5
0
def race_predict(df):
    """Print wru race predictions for ``df`` at county and at tract level.

    Rows whose county is the string "None" or missing are dropped first.
    Nothing is returned; each prediction table is printed after conversion
    to pandas.
    """
    # todo: why are there missing counties?
    # `county == county` is False for NaN, so NaN counties are filtered too
    df = df.query('(county != "None") and (county == county)')
    df.set_index([[1]], inplace=True)

    r = robjects.r
    pandas2ri.activate()
    wru = importr('wru')  # https://github.com/kosukeimai/wru

    # Round every age to the nearest integer
    df['age'] = df['age'].apply(lambda x: round(x))

    # Same prediction twice: once per geography, each with its own census file
    geographies = (
        ('county', 'data_files/census_data_all_states_county.pkl'),
        ('tract', 'data_files/census_data_all_states_tract.pkl'),
    )
    for geo_level, census_file in geographies:
        census_data = joblib.load(census_file)
        X_out = wru.predict_race(voter_file=df,
                                 census_geo=geo_level,
                                 census_key=census_key,
                                 sex=True,
                                 age=True,
                                 census_data=census_data)
        print(pandas2ri.ri2py(X_out))
Exemplo n.º 6
0
def test_sum_stats_save_load(history: History):
    """Store two particles with mixed summary statistics and read them back.

    Each particle carries a float, a numpy array, a pandas dataframe and an
    R dataframe; after the round trip through ``history`` every statistic
    must compare equal to what was stored.
    """
    arr = sp.random.rand(10)
    arr2 = sp.random.rand(10, 2)

    # TODO: check why iris fails
    first_stats = {"ss1": .1, "ss2": arr2,
                   "ss3": example_df(),
                   "rdf0": r["faithful"]}
    second_stats = {"ss12": .11, "ss22": arr,
                    "ss33": example_df(),
                    "rdf": r["mtcars"]}

    particle_list = [
        Particle(m=0,
                 parameter=Parameter({"a": 23, "b": 12}),
                 weight=.2,
                 accepted_sum_stats=[stats],
                 accepted_distances=[.1])
        for stats in (first_stats, second_stats)
    ]

    population = Population(particle_list)
    history.append_population(0, 42, population, 2, ["m1", "m2"])
    weights, sum_stats = history.get_weighted_sum_stats_for_model(0, 0)

    # Two equally weighted particles
    assert (weights == 0.5).all()

    loaded_first = sum_stats[0]
    assert loaded_first["ss1"] == .1
    assert (loaded_first["ss2"] == arr2).all()
    assert (loaded_first["ss3"] == example_df()).all().all()
    assert (loaded_first["rdf0"] == pandas2ri.ri2py(r["faithful"])).all().all()

    loaded_second = sum_stats[1]
    assert loaded_second["ss12"] == .11
    assert (loaded_second["ss22"] == arr).all()
    assert (loaded_second["ss33"] == example_df()).all().all()
    assert (loaded_second["rdf"] == pandas2ri.ri2py(r["mtcars"])).all().all()
Exemplo n.º 7
0
 def read_data(self, data=None, df_name=None):
     """Populate ``self.data`` (and ``self.data_type``) from ``data`` or from ``self.data_source``.

     :param data: optional in-memory data: a pandas DataFrame, a list, or
         (when rpy2 is available) an rpy2 DataFrame.  When omitted the data is
         read from ``self.data_source`` according to ``self.data_type``.
     :param df_name: name of the dataframe to look up inside an Rdata file;
         defaults to ``self.df_name``
     :raises LookupError: no candidate dataframe name exists in the Rdata file
     :raises ValueError: unsupported ``self.data_type`` or unrecognized ``data`` type
     """
     if df_name is None:
         df_name = self.df_name
     if data is None:
         if self.data_type == 'dataframe':
             self.data = pd.read_csv(self.data_source)
         elif self.data_type == 'Rdata' and rpy2_imported:
             robjects.r['load'](self.data_source)
             self.data = None
             # Try the requested name first, then the well-known fallbacks.
             # (The loop variable used to reuse the name `df_name`, which
             # silently discarded the caller's argument.)
             for candidate in [df_name, "vz", "vcdb", "healthcare"]:
                 try:
                     self.data = pandas2ri.ri2py(robjects.r[candidate])
                 except LookupError:
                     continue
                 self.data_type = "dataframe"
                 break
             if self.data is None:
                 raise LookupError("Could not find dataframe name in Rdata file. please specify with df_name=<name>")
         elif self.data_type == 'json':
             # Collect every *.json file found below each source directory
             self.data = []
             for path in self.data_source:
                 for dirpath, _dirnames, files in os.walk(path):
                     self.data += [os.path.join(dirpath, f) for f in files if f.endswith('.json')]
         else:
             raise ValueError("Data type not supported.  If datatype is Rdata, please make sure rpy2 loaded correctly.")
     else:
         self.data = data
         if isinstance(data, pd.DataFrame):
             self.data_type = 'dataframe'
         elif isinstance(data, list):
             self.data_type = 'json'
         # Check the flag first: when rpy2 failed to import, the name `rpy2`
         # may not exist and must not be touched.
         elif rpy2_imported and isinstance(data, rpy2.robjects.vectors.DataFrame):
             self.data = pandas2ri.ri2py(self.data)
             self.data_type = 'dataframe'
         else:
             raise ValueError("type of data boject is unrecognized.  If data object is Rdata, please make sure rpy2 loaded correctly.")
Exemplo n.º 8
0
    def items(self, n_rows_cached=100, include_rsid=None):
        """
        Retrieve generator of variants, one by one. Although variants are returned in the order as they are stored in
        the BGEN file, when there are variants with the same positions their order is not guaranteed.

        The index database is read in batches of ``n_rows_cached`` rows; for
        each batch the matching genotype data is loaded through ``self.rbgen``
        and one row per variant is yielded with an added ``dosages`` entry.

        :param n_rows_cached: number of index rows fetched (and variants loaded) per batch
        :param include_rsid: optional iterable of rsids; when given only those variants are returned
        :return: generator of pandas Series, one per variant
        """
        # retrieve positions
        if include_rsid is not None:
            # Double embedded single quotes so an rsid containing "'" cannot
            # break (or alter) the statement.
            quoted_rsids = ', '.join(
                "'{}'".format(str(x).replace("'", "''")) for x in include_rsid)
            stm = 'select distinct rsid, position from Variant where rsid in ({}) order by file_start_position asc'.format(
                quoted_rsids)
        else:
            stm = 'select distinct rsid, position from Variant order by file_start_position asc'

        # `with sqlite3.connect(...)` only manages transactions and leaves the
        # connection open, so close it explicitly when the generator finishes.
        conn = sqlite3.connect(self.bgi_path)
        try:
            cur = conn.cursor()
            cur.execute(stm)

            iteration = 0

            while True:
                if iteration > 0:
                    # Drop the previous batch before loading the next one
                    # (presumably to release the R-side memory it holds).
                    cached_data_struct = cached_data.__sexp__
                    del cached_data
                    del cached_data_struct
                    gc.collect()

                positions = cur.fetchmany(size=n_rows_cached)
                if not positions:
                    break

                rsids = [x[0] for x in positions]
                positions = [x[1] for x in positions]

                if include_rsid is None:
                    # Load everything between the first and last position of this batch
                    ranges = pd.DataFrame({
                        'chromosome': [self.chr_number],
                        'start': [positions[0]],
                        'end': [positions[-1]],
                    })

                    cached_data = self.rbgen.bgen_load(self.bgen_path, ranges)

                else:
                    cached_data = self.rbgen.bgen_load(self.bgen_path,
                                                       rsids=StrVector(rsids))

                all_variants = pandas2ri.ri2py(cached_data[0])
                all_probs = pandas2ri.ri2py(cached_data[4])

                iteration += 1

                for row_idx, (rsid, row) in enumerate(all_variants.iterrows()):
                    dosage_row = row.rename({'chromosome': 'chr'})
                    dosage_row['chr'] = int(dosage_row.chr)
                    # Weight the last axis of the probabilities by 0, 1 and 2
                    dosage_row['dosages'] = np.dot(all_probs[row_idx, :, :],
                                                   [0, 1, 2])

                    yield dosage_row
        finally:
            conn.close()
Exemplo n.º 9
0
def get_IRIS_package_versions(logger):
    """
    Return a dataframe of version information for IRIS R packages used in ISPAQ.

    Columns are ``package``, ``installed`` (local version), ``CRAN`` (version
    reported by R's ``available.packages()``) and ``upgrade`` (True when the
    package appears in R's ``old.packages()``).  ``logger`` is not used.
    """
    IRIS_packages = ['seismicRoll', 'IRISSeismic', 'IRISMustangMetrics']

    # Locally installed versions
    r_installed = robjects.r(
        "installed.packages()[c('seismicRoll','IRISSeismic','IRISMustangMetrics'),'Version']")
    installed_versions = pandas2ri.ri2py(r_installed).tolist()

    # Versions currently on offer
    r_available = robjects.r(
        "available.packages()[c('seismicRoll','IRISSeismic','IRISMustangMetrics'),'Version']")
    cran_versions = pandas2ri.ri2py(r_available).tolist()

    # Every installed package for which R reports a newer version
    stale_packages = pandas2ri.ri2py(robjects.r("old.packages()[,'Package']")).tolist()

    # needsUpgrade flags, aligned with IRIS_packages
    upgrade = [package in stale_packages for package in IRIS_packages]

    columns = {
        'package': IRIS_packages,
        'installed': installed_versions,
        'CRAN': cran_versions,
        'upgrade': upgrade,
    }
    df = pd.DataFrame(columns)

    # Select the columns in a fixed, explicit order
    return df[['package', 'installed', 'CRAN', 'upgrade']]
Exemplo n.º 10
0
def extract_dataframe_from_R(dataframe_name):
    """Rebuild an R object as a pandas DataFrame with its row and column names.

    ``dataframe_name`` is spliced into three R expressions (the object itself,
    ``rownames(...)`` and ``colnames(...)``).  Column names are cast to
    float32, so they must be numeric.
    """
    values = pandas2ri.ri2py(r(dataframe_name))
    row_labels = pandas2ri.ri2py(r("rownames(" + dataframe_name + ")"))
    raw_column_labels = pandas2ri.ri2py(r("colnames(" + dataframe_name + ")"))
    column_labels = np.float32(raw_column_labels)

    return pd.DataFrame(data=values, columns=column_labels, index=row_labels)
    def fit_and_predict(self, train, horizon):
        """Fit a Holt-Winters model in R and forecast ``horizon`` steps ahead.

        :param train: training series handed to R's ``ts`` (converted by pandas2ri)
        :param horizon: number of steps to forecast
        :return: tuple ``(fit, forecast)`` -- the fitted values and the
            forecast table, both converted from R data frames
        """
        r_string = """
            function(data, frequency, horizon){
                library(forecast)
                ts_data <- ts(data, frequency=frequency)

                fit <- HoltWinters(ts_data)
                fitted_df <- data.frame(fit$fitted)

                forecast <- forecast(fit, h = horizon)
                forecast_df <- data.frame(forecast)

                output <- list(fitted_df, forecast_df)
                return(output)
            }
        """

        r_func = robjects.r(r_string)

        # Run R.  The automatic pandas conversion is switched back off even
        # when the R call or a conversion raises, so a failure here does not
        # leave the global converter activated.
        pandas2ri.activate()
        try:
            output_list = r_func(train, self.frequency, horizon)
            fit = pandas2ri.ri2py(output_list[0])
            forecast = pandas2ri.ri2py(output_list[1])
        finally:
            pandas2ri.deactivate()

        return fit, forecast
Exemplo n.º 12
0
    def predict(self, xtest):
        """Predicts class via majority vote.

        Parameters
        ----------
        xtest : pd.DataFrame
            features for test set

        Returns
        -------
        pd.Series
            predicted class for every row of ``xtest``, shifted down by one
        """
        # Hand the features to R using whichever converter is available
        r_xtest = (pandas2ri.py2ri(xtest) if new_pandas_flag
                   else com.convert_to_r_dataframe(xtest))
        pred = self.rf_pred(self.rf, r_xtest)

        if new_pandas_flag:
            # pred[1] carries the row labels, pred[0] the predicted classes
            genes = pandas2ri.ri2py(pred[1])
            pred_class = pandas2ri.ri2py(pred[0])
        else:
            converted = com.convert_robj(pred)
            genes, pred_class = zip(*converted.items())

        result = pd.DataFrame({'pred_class': pred_class}, index=genes)
        result = result.reindex(xtest.index)  # same row order as the input
        result -= 1  # for some reason the class numbers start at 1
        return result['pred_class']
Exemplo n.º 13
0
    def predict(self, xtest):
        """Predicts class via majority vote.

        Parameters
        ----------
        xtest : pd.DataFrame
            features for test set

        Returns
        -------
        pd.Series
            ``pred_class`` column aligned to ``xtest.index``, minus one
        """
        if new_pandas_flag:
            r_features = pandas2ri.py2ri(xtest)
        else:
            r_features = com.convert_to_r_dataframe(xtest)

        r_prediction = self.rf_pred(self.rf, r_features)

        if new_pandas_flag:
            # Element 1 holds the labels, element 0 the predicted classes
            labels = pandas2ri.ri2py(r_prediction[1])
            classes = pandas2ri.ri2py(r_prediction[0])
        else:
            labels, classes = zip(*com.convert_robj(r_prediction).items())

        frame = pd.DataFrame({'pred_class': classes}, index=labels)
        # Put the predictions back in the order of the input rows
        frame = frame.reindex(xtest.index)
        frame -= 1  # for some reason the class numbers start at 1
        return frame['pred_class']
    def fit_and_predict(self, train, horizon):
        """Fit a TBATS model in R and forecast ``horizon`` steps ahead.

        ``self.frequency`` is passed to R as an integer vector: a single value
        builds a plain ``ts``, several values build an ``msts`` with those
        seasonal periods.

        :param train: training series handed to R (converted by pandas2ri)
        :param horizon: number of steps to forecast
        :return: tuple ``(fit, forecast)`` -- the fitted values and the
            forecast table, both converted from R data frames
        """
        r_string = """
            function(data, frequency, horizon){
                library(forecast)

                if(length(frequency) == 1){
                    ts_data <- ts(data, frequency=frequency)
                }else{
                    ts_data <- msts(data, seasonal.periods=frequency)
                }

                fit <- tbats(ts_data)
                fitted_df <- data.frame(fit$fitted.values)

                forecast <- forecast(fit, h = horizon)
                forecast_df <- data.frame(forecast)

                output <- list(fitted_df, forecast_df)
                return(output)
            }
        """

        r_func = robjects.r(r_string)

        # The automatic pandas conversion is switched back off even when the
        # R call or a conversion raises, so a failure here does not leave the
        # global converter activated.
        pandas2ri.activate()
        try:
            output_list = r_func(train, robjects.IntVector(self.frequency),
                                 horizon)
            fit = pandas2ri.ri2py(output_list[0])
            forecast = pandas2ri.ri2py(output_list[1])
        finally:
            pandas2ri.deactivate()

        return fit, forecast
	def read_spss_to_df(self):
		"""Read the SPSS file at ``self._file_path`` through R.

		The file is loaded with R's ``foreign::read.spss``; as a side effect the
		data is also written to a CSV file whose name is the input path with its
		last four characters replaced by ".csv".

		Returns a tuple ``(df, desc)``: the data as a pandas DataFrame and an
		array with the R "variable.labels" attribute (descriptions of the
		columns), both with unicode text reduced to ASCII.

		NOTE(review): relies on the name ``unicode`` (a Python 2 builtin) --
		confirm the module targets Python 2 or defines it.
		"""
		from rpy2.robjects import r
		from string import Template
		from rpy2.robjects import pandas2ri
		import unicodedata
		file_location = self._file_path # or "./1 - 110778/110778.sav"
		# Drop the 4-character extension (e.g. ".sav") and append ".csv"
		file_location_csv = file_location[:-4] + ".csv"
		r_code = Template('''
		library(foreign)
		library(plyr)

		df <- read.spss ("$origin_file", to.data.frame=TRUE)
		desc <- attr(df,"variable.labels")
		write.csv(df, file="$output_file", na="")
		''')
		r_code = r_code.substitute(origin_file=file_location, output_file=file_location_csv) # Fill in the input and output paths from the Python variables
		r(r_code) # Run the R code; it leaves `df` and `desc` behind in the R session

		df = pandas2ri.ri2py(r('df')) # Convert the R data frame into a pandas DataFrame
		df = df.applymap(lambda x: unicodedata.normalize('NFKD', x).encode('ascii','ignore') if type(x) == unicode else x) # Reduce unicode cells to ASCII; non-ASCII characters are dropped

		desc = pandas2ri.ri2py(r('desc')) # Convert the variable labels into a Python object
		# Same ASCII reduction for every label that is a numpy unicode string
		for j, ele in enumerate(desc):
			if type(desc[j]) == np.unicode_:
				desc[j] = str(unicodedata.normalize('NFKD', desc[j]).encode('ascii','ignore')) # http://stackoverflow.com/questions/1207457/convert-a-unicode-string-to-a-string-in-python-containing-extra-symbols
		desc = desc.astype(np.string_)
		return df, desc
Exemplo n.º 16
0
    def _predict_one(self, fitted):
        """
        Combine the fitted coefficients with the most recent lagged values.

        Parameters
        ----------
        fitted
            fitted R model object exposing ``h.t`` and ``residuals`` slots

        Returns
        -------
        Dot product of the coefficients (``mu`` dropped when present) with the
        vector made of the last ``garch_lags`` values of ``h.t``, the last
        ``arch_lags`` squared residuals and a constant 1 labelled ``omega``.
        """
        # lagged conditional variances
        cond_variances = pandas2ri.ri2py(fitted.slots['h.t'])
        lagged_h = pd.Series(data=cond_variances[-self.garch_lags:],
                             index=self.garch_names)

        # lagged squared residuals
        residuals = pandas2ri.ri2py(fitted.slots['residuals'])
        lagged_eps_sq = pd.Series(data=residuals[-self.arch_lags:]**2,
                                  index=self.arch_names)

        # 1 to be multiplied with omega
        constant = pd.Series(data=[1], index=["omega"])

        # all together
        regressors = pd.concat((lagged_h, lagged_eps_sq, constant))

        coefficients = self.get_coef(fitted).drop("mu", errors="ignore")
        return coefficients.dot(regressors)
Exemplo n.º 17
0
 def convert_fit_to_python(self, fit):
     """Return the coefficients of an R model fit as a pandas Series.

     :param fit: R object exposing a ``coefficients`` element through ``rx2``
     :return: pd.Series of coefficient values indexed by coefficient name
     """
     coeffs_r = fit.rx2('coefficients')
     coeffs = pandas2ri.ri2py(coeffs_r)
     coeff_names = pandas2ri.ri2py(coeffs_r.names).tolist()
     # The fitted values used to be converted here and then thrown away;
     # that dead conversion is dropped because only the coefficients are returned.
     return pd.Series(dict(zip(coeff_names, coeffs)))
Exemplo n.º 18
0
def ref_estimate_cell_counts(input_r_object_dir, algorithm, reference, library,
                             output_csv):
    """Reference based cell type estimates.

    :param input_r_object_dir: directory holding ``QCObjects.rds`` (meffil)
        or ``RGSet.rds`` (minfi / IDOL)
    :param algorithm: ``'meffil'``, ``'minfi'`` or ``'IDOL'``; any value other
        than the first two is handled by the IDOL estimator
    :param reference: reference passed to the meffil estimator
    :param library: library passed to the IDOL estimator
    :param output_csv: path of the CSV file the estimates are written to
    """
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri, numpy2ri

    pandas2ri.activate()
    from pymethylprocess.meffil_functions import est_cell_counts_meffil, est_cell_counts_minfi, est_cell_counts_IDOL

    # Create the output directory only when the path actually has one;
    # slicing at rfind('/') produced a bogus directory for bare file names.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    read_r_object = robjects.r('readRDS')
    # IDOL works on minfi objects, so it needs the minfi library loaded
    robjects.r(
        'library({})'.format(algorithm if algorithm != 'IDOL' else 'minfi'))
    if algorithm == 'meffil':
        qc_list = read_r_object(join(input_r_object_dir, 'QCObjects.rds'))
        cell_counts = est_cell_counts_meffil(qc_list, reference)
    else:
        rgset = read_r_object(join(input_r_object_dir, 'RGSet.rds'))
        # This branch used to test for 'meffil' again, which can never be true
        # here, so a minfi request was silently handled by the IDOL estimator.
        if algorithm == 'minfi':
            cell_counts = est_cell_counts_minfi(rgset)
        else:
            cell_counts = est_cell_counts_IDOL(rgset, library)

    # Convert the estimates to a pandas dataframe and save them
    pandas2ri.ri2py(
        robjects.r('as.data.frame')(cell_counts)).to_csv(output_csv)
Exemplo n.º 19
0
def test_sum_stats_save_load(history: History):
    """Store two particles with mixed summary statistics and read them back.

    The statistics mix floats, numpy arrays, pandas dataframes and R
    dataframes; after the round trip through ``history`` each one must
    compare equal to what was stored.
    """
    arr = sp.random.rand(10)
    arr2 = sp.random.rand(10, 2)

    first_stats = {
        "ss1": .1,
        "ss2": arr2,
        "ss3": example_df(),
        "rdf0": r["iris"],
    }
    second_stats = {
        "ss12": .11,
        "ss22": arr,
        "ss33": example_df(),
        "rdf": r["mtcars"],
    }

    # Both particles share every argument except their summary statistics
    particle_list = [
        Particle(0, Parameter({"a": 23, "b": 12}), .2, [.1], [stats], [], True)
        for stats in (first_stats, second_stats)
    ]

    population = Population(particle_list)
    history.append_population(0, 42, population, 2, ["m1", "m2"])
    weights, sum_stats = history.get_sum_stats(0, 0)

    # Two equally weighted particles
    assert (weights == 0.5).all()

    loaded_first = sum_stats[0]
    assert loaded_first["ss1"] == .1
    assert (loaded_first["ss2"] == arr2).all()
    assert (loaded_first["ss3"] == example_df()).all().all()
    assert (loaded_first["rdf0"] == pandas2ri.ri2py(r["iris"])).all().all()

    loaded_second = sum_stats[1]
    assert loaded_second["ss12"] == .11
    assert (loaded_second["ss22"] == arr).all()
    assert (loaded_second["ss33"] == example_df()).all().all()
    assert (loaded_second["rdf"] == pandas2ri.ri2py(r["mtcars"])).all().all()
Exemplo n.º 20
0
    def plot_qc_metrics(self, output_dir):
        """Plot QC results from ENmix pipeline and possible minfi. Still experimental.

        Writes ``dist.jpg`` (density and frequency-polygon plots for all
        probes and for the Infinium I / II subsets) and ``qcReport.pdf`` into
        ``output_dir``, and issues the minfi MDS and density plots.

        Parameters
        ----------
        output_dir
            Where to store plots."""
        self.enmix.plotCtrl(self.RGset)
        grdevice = importr("grDevices")
        geneplotter = importr("geneplotter")
        base = importr('base')

        # Split the beta values by probe design type using the annotation
        anno = self.minfi.getAnnotation(self.RGset)
        anno_py = pandas2ri.ri2py(robjects.r['as'](anno, 'data.frame'))
        beta_py = pandas2ri.ri2py(self.beta)
        beta1 = numpy2ri.py2ri(beta_py[anno_py["Type"] == "I"])
        beta2 = numpy2ri.py2ri(beta_py[anno_py["Type"] == "II"])

        grdevice.jpeg(output_dir+'/dist.jpg', height=900, width=600)
        try:
            # 3 x 2 grid: one row per probe set, density | frequency polygon
            base.par(mfrow=robjects.vectors.IntVector([3, 2]))
            self.enmix.multidensity(self.beta, main="Multidensity")
            self.enmix.multifreqpoly(self.beta, xlab="Beta value")
            self.enmix.multidensity(beta1, main="Multidensity: Infinium I")
            self.enmix.multifreqpoly(beta1, main="Multidensity: Infinium I", xlab="Beta value")
            self.enmix.multidensity(beta2, main="Multidensity: Infinium II")
            self.enmix.multifreqpoly(beta2, main="Multidensity: Infinium II", xlab="Beta value")
        finally:
            # Always close the jpeg device, otherwise a failed plot leaves it
            # open and later plots keep drawing into dist.jpg.
            grdevice.dev_off()

        self.minfi.qcReport(self.RGset, pdf = "{}/qcReport.pdf".format(output_dir))
        self.minfi.mdsPlot(self.RGset)
        self.minfi.densityPlot(self.RGset, main='Beta', xlab='Beta')
def lmm_analysis(dataframe, filename, output, target):
    """
    Compare a linear mixed model with a plain linear model of ``target``.

    In R, ``target ~ size_tad + (1|Tad)`` (lmer) is tested against
    ``target ~ size_tad`` (lm) with a Chisq anova.  Residual diagnostics of
    both models are saved as PNG files below ``output``; the anova table is
    printed and written, tab separated, to ``<filename>_glmm_stats.txt``.

    :param dataframe: (pandas DataFrame) data with size_tad, Tad and the target column
    :param filename: (string) prefix of the statistics file
    :param output: (string) folder receiving the diagnostic figures
    :param target: (string) name of the response column
    """
    r_df = pandas2ri.py2ri(dataframe)

    # The target name is spliced in four times: both formulas and both file names
    r_source = """
    require("DHARMa")
    require(lme4)
    require("MASS")
    function(data, name, target){
        mod <- lmer(%s ~ size_tad + (1|Tad), data=data)
        nulmod <- lm(%s ~ size_tad, data=data)
        simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
        png(paste(name, "/mod_dignostics_%s.png", sep=""), height=1080, width=1920)
        par(mfrow=c(2, 2))
        plot(simulationOutput)
        dev.off()
        simulationOutput <- simulateResiduals(fittedModel = nulmod, n = 250)
        png(paste(name, "/nulmod_dignostics_%s.png", sep=""), height=1080, width=1920)
        par(mfrow=c(2, 2))
        plot(simulationOutput)
        dev.off()
        return(anova(mod, nulmod, test="Chisq"))
    }

    """ % ((target,) * 4)
    stat_s = r(r_source)

    anova_table = stat_s(r_df, output, target)
    print(anova_table)

    stats_file = "%s_glmm_stats.txt" % filename
    pandas2ri.ri2py(anova_table).to_csv(stats_file, sep="\t", index=False)
Exemplo n.º 22
0
    def krige(self, i=0, v=None, step=1, res=True, plot_v=False, plot_k=True, animated=False, **plot_kwargs):
        """
        Krige the dataframe with a single data column or a column index number

        Parameters
        ----------
        self : Event object with at least one data column

        i : int data column index number (defaults to 0)
        v : variogram (dataframe with ``gamma`` and ``dist`` columns) to use in
            determining sill and range; computed in R when None
        step : grid interval to krige on (in km)
        res : bool detrend points before computing kriged values - default True
        plot_v : bool plot variogram - default False (not used by this method)
        plot_k : bool plot kriged values - default True
        animated : bool return axis for animation - default False

        **plot_kwargs (cmap, s, latlon, basemap, shpfile, POT, locs, colors)

        Returns
        -------
        The result of ``plot_krige`` when ``plot_k`` and ``animated`` are both
        set, nothing when only ``plot_k`` is set, otherwise ``k``, the
        dataframe containing output from the r-krige function.  ``k`` is
        always stored on ``self.k``.

        Raises
        ------
        ValueError
            when no variogram value exceeds the partial sill, so no range can
            be derived
        """
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        rfuncs = import_r_tools()

        if 'X' not in self.ll_cols:
            self.set_ll()

        if res:
            if not hasattr(self, 'res'):
                self.detrend()
            df = self.res
        else:
            df = self.df
        cols = self.data_cols

        r_df = df.loc[:, ['X', 'Y', cols[i]]].dropna(how='any')
        # `not v` raises for a dataframe (ambiguous truth value), so a
        # caller-supplied variogram could never be used; test for None instead.
        if v is None:
            v = pandas2ri.ri2py(rfuncs.get_variogram(r_df))

        model = 'Sph'
        # Partial sill: variance of the data column
        psill = r_df.var()[cols[i]]

        # Range: first distance at which the variogram exceeds the sill
        rng = None
        for j in range(len(v)):
            if v.gamma[j] > psill:
                rng = v.dist[j]
                break
        if rng is None:
            # Previously this surfaced later as an unbound-variable error
            raise ValueError("variogram never exceeds the partial sill; cannot determine a range")

        k = pandas2ri.ri2py(rfuncs.get_krige(r_df, psill, model, rng, step=step))
        # Convert the kriging grid from km back to degrees
        k['lat'] = k.y/110.574
        k['lon'] = k.x/(111.320*(k['lat']*pi/180).apply(cos))
        self.k = k
        if plot_k and animated:
            return self.plot_krige(i, k, rng, step=step, res=res, animated=animated, **plot_kwargs)
        elif plot_k and not animated:
            self.plot_krige(i, k, rng, step=step, res=res, animated=animated, **plot_kwargs)
        else:
            return k
Exemplo n.º 23
0
 def test_fit_with_pandas_data(self, Model, dataframe):
     """Fitting with pandas data must hand the same values to the R function.

     The recorded call of ``myfunc`` is inspected: its first two positional
     arguments convert back to X and y, and the extra keyword is forwarded.
     """
     features, target = dataframe
     model = Model(scriptname='myscript', funcname='myfunc', some='kwarg')
     model.fit(features, target)

     recorded_call = model.r['myfunc'].call_args
     positional = recorded_call[0]
     keywords = recorded_call[1]

     assert (ri2py(positional[0]).values == features.values).all()
     assert (ri2py(positional[1]) == target).all()
     assert keywords['some'] == 'kwarg'
Exemplo n.º 24
0
    def get_features(self, d=None, thresh=.01, sigma=3, min_size=4, const=5, return_dict=False, buffer=False):
        '''
        Use r package SpatialVx to identify features.

        Consecutive time slices of ``self.box`` are compared pairwise; slices
        for which the R summaries fail are skipped.

        Parameters
        ----------
        d: dict to add the per-time results to (updated in place); a fresh
           dict is used when omitted
        thresh: .01
        sigma: 3
        min_size: 4
        const: 5
        return_dict: return the dict of dataframes instead of a pd.Panel
        buffer: False

        Return
        ------
        p: pd.Panel containing parameters characterizing the features found
           (or the dict itself when return_dict is set)
        '''
        from rpy2 import robjects
        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        SpatialVx = importr('SpatialVx')
        rsummary = robjects.r.summary
        r_tools = import_r_tools()

        # A `{}` default would be shared between calls, so results from an
        # earlier call leaked into later ones; start from a fresh dict instead.
        if d is None:
            d = {}

        ll = np.array([self.lon.flatten('F'), self.lat.flatten('F')]).T
        for i in range(self.box.shape[0]-1):
            hold = SpatialVx.make_SpatialVx(self.box[i,:,:], self.box[i+1,:,:], loc=ll)
            look = r_tools.FeatureFinder_gaussian(hold, nx=self.box.shape[2], ny=self.box.shape[1],
                                                  thresh=thresh, smoothpar=sigma, **(dotvars(min_size=min_size)))
            # Best effort: skip this pair when R cannot summarise the features.
            # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
            try:
                x = rsummary(look, silent=True)[0]
            except Exception:
                continue
            px = pandas2ri.ri2py(x)
            df0 = pd.DataFrame(px, columns=['centroidX', 'centroidY', 'area', 'OrientationAngle',
                                          'AspectRatio', 'Intensity0.25', 'Intensity0.9'])
            # 1-based feature number, used as the merge key below
            df0['Observed'] = list(df0.index+1)
            m = SpatialVx.centmatch(look, criteria=3, const=const)
            p = pandas2ri.ri2py(m[12])
            df1 = pd.DataFrame(p, columns=['Forecast', 'Observed'])
            match_analysis = SpatialVx.FeatureMatchAnalyzer(m)
            try:
                p = pandas2ri.ri2py(rsummary(match_analysis, silent=True))
            except Exception:
                continue
            df2 = pd.DataFrame(p, columns=['Partial Hausdorff Distance','Mean Error Distance','Mean Square Error Distance',
                                          'Pratts Figure of Merit','Minimum Separation Distance', 'Centroid Distance',
                                          'Angle Difference','Area Ratio','Intersection Area','Bearing', 'Baddeleys Delta Metric',
                                          'Hausdorff Distance'])
            df3 = df1.join(df2)

            d.update({self.time[i]: pd.merge(df0, df3, how='outer')})
        if return_dict:
            return(d)
        p = pd.Panel(d)
        if buffer:
            return(self.add_buffer(p))
        return(p)
def cfit_to_df(dgelist, c_fit):
    """Join edgeR top-tags statistics with log CPM values per gene.

    :param dgelist: object passed to ``edgeR.cpm`` (log scale)
    :param c_fit: fit passed to ``edgeR.topTags`` (all rows, BH adjusted, unsorted)
    :return: dataframe indexed by ``gene_id`` holding the top-tags columns
        followed by the CPM columns (outer join)
    """
    r_rownames = robj.r('rownames')

    # log CPM table, labelled with the R row names
    r_cpm = edgeR.cpm(dgelist, log=True)
    cpm_table = as_data_frame(r_cpm)
    cpm_table.index = pandas2ri.ri2py(r_rownames(r_cpm))

    # statistics table, labelled the same way
    r_tags = edgeR.topTags(c_fit, n=np.inf, adjust_method='BH', sort_by='none')
    tags_table = as_data_frame(r_tags)
    tags_table.index = pandas2ri.ri2py(r_rownames(r_tags))

    combined = tags_table.join(cpm_table, how='outer')
    combined.index.name = 'gene_id'
    return combined
Exemplo n.º 26
0
def DEA(counts, design_r, contrasts=None, adjust_method='BH'):
    """
    Run a limma/voom differential expression analysis.

    contrasts needs to be a dictionary with string keys and values to be used as design contrasts or
    a list of int values to be used as design coefficients

    :param counts: count table (its length, columns and index are used)
    :param design_r: R design matrix
    :param contrasts: dict of string pairs or list of 1-based coefficient numbers
    :param adjust_method: multiple-testing adjustment handed to ``topTable``
    :raises ValueError: when contrasts is missing or malformed
    :return: results (ordered dict of dataframes, one per contrast) and
        pd.DataFrame of normalized counts
    """
    if contrasts is None:
        # The None default used to fall through to the iteration below and
        # fail with an opaque "NoneType is not iterable" TypeError.
        raise ValueError('contrasts is required: pass a dict of string pairs or a list of integers')
    if isinstance(contrasts, list):
        coefs = True
        if not all(isinstance(k, int) for k in contrasts):
            raise ValueError('coefficient list should be all integers')
        # Map each coefficient number to the matching design column name
        coefnames = list(ro.r.colnames(design_r))
        contrasts = OrderedDict([(coefnames[c - 1], c) for c in contrasts])
    else:
        coefs = False
        if not (all(isinstance(k, str) for k in contrasts)
                and all(isinstance(v, str) for v in contrasts.values())):
            raise ValueError('contrast dict should be all string pairs')

    # import R limma package
    limma = importr('limma')
    # tranform counts with voom
    voomedCounts_r = limma.voom(counts,
                                design=design_r,
                                plot=True,
                                normalize="quantile")
    fit_r = limma.lmFit(voomedCounts_r, design_r)
    fit_r = limma.eBayes(fit_r)
    coefficients_r = fit_r.rx2('coefficients')  #fit_r$coefficients
    if coefs:
        # Coefficients are read straight from the design fit
        fit_contrasts_r = fit_r
    else:
        contrasts_r, contrasts_p = prepareContrasts(design_r,
                                                    contrasts.values(),
                                                    RReturnOnly=False)
        fit_contrasts_r = limma.contrasts_fit(fit_r, contrasts_r)
        fit_contrasts_r = limma.eBayes(fit_contrasts_r)
    print(ro.r.summary(fit_contrasts_r))

    #Full results
    results = OrderedDict()
    for res in contrasts:
        result_r = limma.topTable(fit_contrasts_r,
                                  coef=contrasts[res],
                                  n=len(counts),
                                  adjust_method=adjust_method)
        results[res] = pandas2ri.ri2py(result_r)
        results[res].index = ro.r.rownames(result_r)
        print('# sig', res, '->', (results[res]['adj.P.Val'] <= 0.05).sum())

    return results, pd.DataFrame(pandas2ri.ri2py(voomedCounts_r.rx2('E')),
                                 columns=counts.columns,
                                 index=counts.index)
Exemplo n.º 27
0
 def predict(self, indep_vars):
     """Run the fitted R network on ``indep_vars`` and return the converted output.

     The inputs and the fitted model are published in the R global
     environment as ``test`` and ``fit``.  Returns None when
     ``self.algorithm`` is neither "rprop+" nor "ADAPTgdwm".
     """
     ro.globalenv['test'] = pandas2ri.py2ri(indep_vars)
     ro.globalenv['fit'] = self.fitted_model

     # Each algorithm is evaluated through its own R call
     if self.algorithm == "rprop+":
         r_expression = "compute(fit,test)$net.result"
     elif self.algorithm == "ADAPTgdwm":
         r_expression = "sim(fit$net, test)"
     else:
         return None

     return pandas2ri.ri2py(ro.r(r_expression))
Exemplo n.º 28
0
def get_wunifrac_distance(phyloseq_d):
    """Return weighted, normalized UniFrac distances as a square DataFrame.

    :param phyloseq_d: R phyloseq object passed to ``phyloseq::UniFrac``
    :return: pandas DataFrame whose index and columns are both the sample
        names reported by ``phyloseq::sample_names``
    """
    R_phyloseq = importr('phyloseq')
    R_base = importr('base')

    distances = R_phyloseq.UniFrac(phyloseq_d, weighted=True, normalized=True, fast=True, parallel=False)
    distance_mat = R_base.as_matrix(distances)
    # Query and convert the sample names once; they label both axes.
    sample_names = pandas2ri.ri2py(R_phyloseq.sample_names(phyloseq_d))
    return pd.DataFrame(numpy2ri.ri2py(distance_mat),
                        index=sample_names,
                        columns=sample_names)
Exemplo n.º 29
0
def get_distance(phyloseq_d, dist_method):
    """Return pairwise sample distances as a square DataFrame.

    :param phyloseq_d: R phyloseq object passed to ``phyloseq::distance``
    :param dist_method: value forwarded as the ``method`` argument of
        ``phyloseq::distance``
    :return: pandas DataFrame whose index and columns are both the sample
        names reported by ``phyloseq::sample_names``
    """
    R_phyloseq = importr('phyloseq')
    R_base = importr('base')

    distances = R_phyloseq.distance(phyloseq_d, method=dist_method)
    distance_mat = R_base.as_matrix(distances)
    # Query and convert the sample names once; they label both axes.
    sample_names = pandas2ri.ri2py(R_phyloseq.sample_names(phyloseq_d))
    return pd.DataFrame(numpy2ri.ri2py(distance_mat),
                        index=sample_names,
                        columns=sample_names)
Exemplo n.º 30
0
Arquivo: ets.py Projeto: lulzzz/mtsg
def ets_1(train, test, hor=24, freq=24):
    """Fit an ETS model on ``train`` and apply it to ``test``.

    Both inputs are turned into R ``ts`` objects with frequency ``freq``.
    ``forecast::ets`` is fitted on the training series, then called again
    on the test series with the training fit passed as ``model``.

    :param train: training data; its ``index`` labels the first result
    :param test: test data; its ``index`` labels the second result
    :param hor: accepted but not used by this function
    :param freq: frequency handed to R's ``ts``
    :return: tuple ``(train_pred, test_pred)`` of pandas Series holding
        R's ``fitted()`` values for each fit
    """
    pandas2ri.activate()
    r_forecast = importr('forecast')
    make_ts = ro.r.ts
    r_fitted = ro.r('fitted')

    # Build the R time series for both data sets.
    train_ts = make_ts(train, frequency=freq)
    test_ts = make_ts(test, frequency=freq)

    # Fit on the training set, then hand that fit to ets() for the test set.
    train_model = r_forecast.ets(train_ts)
    test_model = r_forecast.ets(test_ts, model=train_model)

    # Wrap the R float vectors in Series aligned with the original indexes.
    train_pred = pd.Series(pandas2ri.ri2py(r_fitted(train_model)), index=train.index)
    test_pred = pd.Series(pandas2ri.ri2py(r_fitted(test_model)), index=test.index)
    return train_pred, test_pred
Exemplo n.º 31
0
 def assays(self, rs4_assays):
     """Rebuild ``self._assays`` from the ``listData`` slot of ``rs4_assays``.

     Each entry is stored under its name from the converted list.
     Entries whose type is exactly ``robjects.methods.RS4`` are passed to
     ``SingleCellExperiment.DCGtoCSR`` using their ``x``, ``i``, ``p`` and
     ``Dimnames`` slots; entries whose type is exactly
     ``robjects.vectors.Matrix`` are wrapped in ``csr_matrix``. Any other
     entry is ignored.
     """
     list_vector = pandas2ri.ri2py(rs4_assays.slots["listData"])
     self._assays = dict()
     for assay, label in zip(list_vector, list_vector.names):
         assay_type = type(assay)
         if assay_type == robjects.methods.RS4:
             values = assay.slots["x"]
             row_indices = pandas2ri.ri2py(assay.slots["i"])
             col_pointers = pandas2ri.ri2py(assay.slots["p"])
             dimnames = list(pandas2ri.ri2py(assay.slots["Dimnames"]))
             # The length of the first Dimnames entry is used as the row count.
             n_rows = len(dimnames[0])
             self._assays[label] = SingleCellExperiment.DCGtoCSR(
                 values, row_indices, col_pointers, n_rows)
         elif assay_type == robjects.vectors.Matrix:
             self._assays[label] = csr_matrix(pandas2ri.ri2py(assay))
Exemplo n.º 32
0
def getRAnoval(formula, data, factor='genotype'):
    '''
    Fit a linear model in R and return its ANOVA and HSD post-hoc results
    using the rpy2 module.

    :param formula: model formula forwarded to R's ``lm``
    :param data: data forwarded to R's ``lm``
    :param factor: name of the model term to report; it is passed to
        ``agr.HSD_test`` as the treatment and used as the row label in the
        ANOVA table (defaults to 'genotype', the previously hard-coded value)
    :return: dict with keys 'pvalue' (the ``Pr(>F)`` entry for ``factor``),
        'fvalue' (the ``F value`` entry for ``factor``) and 'posthoc'
        (DataFrame built from the ``comparison`` element of the HSD result)
    '''
    model1 = robjects.r.lm(formula=formula, data=data)
    anv = robjects.r.anova(model1)
    # NOTE(review): `agr` is presumably the agricolae package (Tukey HSD) -- confirm.
    postHocHSD = agr.HSD_test(model1, factor, group=False, console=False)
    postHoc = pd.DataFrame(pandas2ri.ri2py(postHocHSD.rx2('comparison')))
    smry1 = pd.DataFrame(pandas2ri.ri2py(anv))
    pVal = smry1['Pr(>F)'][factor]
    fValue = smry1['F value'][factor]
    return {'pvalue': pVal, 'fvalue': fValue, 'posthoc': postHoc}
def train_elastic_net_wrapper(features_data_, features_, d_, data_annotation_, x_w=None, prune=True, nested_folds=10):
    """Build R inputs from Python data and call ``train_elastic_net``.

    The feature values listed in ``features_.id`` are packed into an R
    matrix with ``features_.shape[0]`` columns whose dimnames are the
    ``d_["individual"]`` values and the feature ids. The response is
    ``d_[data_annotation_.gene_id]``.

    :param x_w: optional weights forwarded as ``penalty_factor``; the
        argument is omitted entirely when ``None``
    :param prune: accepted but not used by this function
    :param nested_folds: forwarded as ``n_train_test_folds``
    :return: the first two elements of the result, each converted with
        ``pandas2ri.ri2py``
    """
    feature_matrix = numpy.array([features_data_[v] for v in features_.id.values])
    names = robjects.ListVector(
        [(1, robjects.StrVector(d_["individual"])), (2, robjects.StrVector(features_.id.values))])
    r_x = robjects.r["matrix"](robjects.FloatVector(feature_matrix.flatten()), ncol=features_.shape[0], dimnames=names)
    r_y = robjects.FloatVector(d_[data_annotation_.gene_id])
    r_folds = robjects.FloatVector([nested_folds])
    # py2ri chokes on None, so penalty_factor is only passed when weights exist.
    if x_w is not None:
        # These are observation weights, not explanatory variable weights.
        fit = train_elastic_net(r_y, r_x, penalty_factor=x_w, n_train_test_folds=r_folds)
    else:
        fit = train_elastic_net(r_y, r_x, n_train_test_folds=r_folds)
    return pandas2ri.ri2py(fit[0]), pandas2ri.ri2py(fit[1])
Exemplo n.º 34
0
def _parse_assayData(assayData, assay):
    """Convert one assay of an rpy2 assayData object into a DataFrame.

    assayData: Rpy2 Environment object, indexed by assay name.
    assay: name of the assay to load.

    Returns a pandas DataFrame whose index holds the R row names and whose
    columns hold the R column names of the selected matrix. Note that
    pandas conversion is activated and left active.
    """
    pandas2ri.activate()
    r_matrix = assayData[assay]  # rpy2 expression matrix object
    values = pandas2ri.ri2py(r_matrix)
    row_labels = pandas2ri.ri2py(r.rownames(r_matrix))
    col_labels = pandas2ri.ri2py(r.colnames(r_matrix))
    return pd.DataFrame(values, index=row_labels, columns=col_labels)
Exemplo n.º 35
0
def bumfit(p_vals, tau):
    """Fit a Bum model to ``p_vals`` and return its estimates and ``ahat``.

    :param p_vals: p-values; wrapped in a pandas Series and converted to R
    :param tau: second argument handed to the R ``summary`` call
    :return: tuple ``(q, ahat)`` where ``q`` is the converted ``estimates``
        slot of the summary and ``ahat`` is the converted ``ahat`` slot of
        the summary's ``bum`` slot
    """
    r_pvals = pandas2ri.py2ri(pd.Series(p_vals))
    fit = dunn.Bum(r_pvals)
    summary = obase.summary(fit, tau)
    # R's S4 slot accessor `@`, looked up by name on the base package.
    at_sym = base.__dict__["@"]
    estimates = at_sym(summary, "estimates")
    bum = at_sym(summary, "bum")
    # Only ahat is returned; the lhat/pihat conversions and the DataFrame
    # assembled from them were never used, so they are no longer computed.
    ahat = pandas2ri.ri2py(at_sym(bum, "ahat"))
    q = pandas2ri.ri2py(estimates)
    return q, ahat
Exemplo n.º 36
0
def run_fcs(ticker, debugTF=False, funcName='rForecast', **optx):
    """Run an R forecasting function on the closing prices of ``ticker``.

    The price history comes from ``pull_stock_data``; the R code is loaded
    by sourcing ``./_alan_ohlc_fcs.r`` and the function named ``funcName``
    is looked up in the R global environment.

    :param ticker: symbol handed to ``pull_stock_data`` and written into
        the ``ticker`` column of the results
    :param debugTF: when truthy, diagnostic output is written; also
        forwarded to the R function
    :param funcName: name of the R function to call
    :param optx: overrides for the default options defined below
    :return: tuple ``(dd, dwm, datax)``: first result element, second
        result element (an empty DataFrame unless ``dwmTF`` is True) and
        the raw data
    :raises NameError: if ``funcName`` is not defined in the R global
        environment after sourcing the R file
    """
    # get data
    datax = pull_stock_data(ticker)
    asof = int(datax['pbdate'].iloc[-1])
    if debugTF is True:
        # Parenthesised single argument prints identically on Python 2 and 3.
        print(datax.tail())

    # get r-code
    pandas2ri.activate()
    rstring = 'source("./_alan_ohlc_fcs.r")'
    r(rstring)

    # convert to r-data
    df = pandas2ri.py2ri(datax['close'][:])

    # run r-function
    opts = {
        'nfcs': 30,
        'dwmTF': True,
        'autoArima': False,
        'difTF': True,
        'funcname': 'rAR',
        'logTF': True,
        'plevel': 0.7,
        'freq': 'W'
    }
    opts.update(optx)
    optR = subDict(opts, [
        'nfcs', 'plevel', 'funcname', 'autoArima', 'logTF', 'difTF', 'freq',
        'fcsLst', 'dwmTF'
    ])
    if debugTF:
        sys.stderr.write("==Input Args:{}\n".format(optR))
        sys.stderr.write("==asof {},df:\n{}\n".format(
            asof, datax['close'][-5:]))
    # Previously a missing function left `funcArg` unbound and the call
    # below failed with an opaque UnboundLocalError (a NameError subclass).
    if funcName not in robj.globalenv:
        raise NameError(
            "R function '{}' is not defined in the R global environment".format(funcName))
    funcArg = robj.globalenv[funcName]
    ret = funcArg(df, asof, debugTF=debugTF, **optR)
    if opts['dwmTF'] is True:
        dwm = pandas2ri.ri2py(ret[1])
        dwm['ticker'] = ticker
    else:
        dwm = pd.DataFrame()
    dd = pandas2ri.ri2py(ret[0])
    dd['ticker'] = ticker
    return (dd, dwm, datax)
Exemplo n.º 37
0
def apply_PSD_metric(r_stream, *args, **kwargs):
    """
    Invoke the PSDMetric and convert the R dataframe result into
    a Pandas dataframe.
    :param r_stream: an r_stream object
    :param (optional kwarg) evalresp= pandas dataframe of FAP from evalresp (freq,amp,phase)
    :return: tuple of GeneralValueMetrics, corrected PSD, and PDF

    Only the ``evalresp`` keyword is used; other positional and keyword
    arguments are accepted but ignored. pandas conversion is activated for
    the duration of the call and always deactivated again, even on error.
    """

    R_function = robjects.r('IRISMustangMetrics::PSDMetric')
    pandas2ri.activate()
    try:
        # look for optional parameter evalresp=pd.DataFrame
        evalresp = kwargs.get('evalresp')

        if evalresp is not None:
            r_evalresp = pandas2ri.py2ri(evalresp)  # convert to R dataframe
            r_listOfLists = R_function(r_stream, evalresp=r_evalresp)
        else:
            r_listOfLists = R_function(r_stream)

        r_metriclist = r_listOfLists[0]
        if r_metriclist:
            r_dataframe = _R_metricList2DF(r_metriclist)
            df = pandas2ri.ri2py(r_dataframe)
            # Convert columns from R POSIXct to python UTCDateTime
            df.starttime = df.starttime.apply(UTCDateTime)
            df.endtime = df.endtime.apply(UTCDateTime)
        else:
            # PSDMetric returns no PSD derived metrics
            df = pd.DataFrame()

        # correctedPSD is returned as a dataframe
        r_correctedPSD = r_listOfLists[2]
        PSDCorrected = pandas2ri.ri2py(r_correctedPSD)

        # Convert columns from R POSIXct to python UTCDateTime
        PSDCorrected.starttime = PSDCorrected.starttime.apply(UTCDateTime)
        PSDCorrected.endtime = PSDCorrected.endtime.apply(UTCDateTime)

        r_PDF = r_listOfLists[3]
        PDF = pandas2ri.ri2py(r_PDF)
    finally:
        # Do not leave the global conversion switched on after a failure.
        pandas2ri.deactivate()

    return (df, PSDCorrected, PDF)
Exemplo n.º 38
0
def getRKrusWall(formula, data):
    '''
    Run a Kruskal-Wallis test in R (via rpy2) followed by a Dunn post-hoc
    test with method 'bh'.

    Returns a dict with keys 'pvalue' (scalar taken from the test's
    ``p.value``), 'chi-squared' (DataFrame built from the test's
    ``statistic``) and 'posthoc' (the Dunn ``res`` table sorted by its
    ``Comparison`` column).
    '''
    kw_test = statsR.kruskal_test(formula=formula, data=data)
    p_frame = pd.DataFrame(pandas2ri.ri2py(kw_test.rx2('p.value')))
    dunn_test = fsa.dunnTest(formula, data=data, method='bh')
    dunn_table = pd.DataFrame(pandas2ri.ri2py(dunn_test.rx2('res')))
    statistic_frame = pd.DataFrame(pandas2ri.ri2py(kw_test.rx2('statistic')))

    summary = {}
    summary['pvalue'] = p_frame[0][0]
    summary['chi-squared'] = statistic_frame
    summary['posthoc'] = dunn_table.sort_values(by=['Comparison'])
    return summary
Exemplo n.º 39
0
def SCCA_r(X, Y, n_components, pen):
    """Run ``PMA.CCA`` on ``X`` and ``Y`` and return loadings and correlations.

    ``X`` and ``Y`` are also converted to R objects and published in the
    R global environment under the names ``X`` and ``Y``.

    :param n_components: forwarded as ``K``
    :param pen: pair of penalties; ``pen[0]`` is ``penaltyx`` and
        ``pen[1]`` is ``penaltyz``
    :return: tuple ``(loadings, cors)`` where ``loadings`` is a pair of
        numpy matrices built from result elements 1 and 2 and ``cors`` is
        result element 15
    """
    r_x = pandas2ri.py2ri(pd.DataFrame(X))
    r_y = pandas2ri.py2ri(pd.DataFrame(Y))
    ri.globalenv['X'] = r_x
    ri.globalenv['Y'] = r_y

    # NOTE(review): the Python X/Y are passed here, not the converted
    # objects above -- confirm this is intended.
    result = PMA.CCA(x=X, z=Y, K=n_components, niter =100, standardize=False, penaltyx=pen[0], penaltyz=pen[1])
    u = pandas2ri.ri2py(result[1])
    v = pandas2ri.ri2py(result[2])
    cors = pandas2ri.ri2py(result[15])

    return (np.asmatrix(u), np.asmatrix(v)), cors
Exemplo n.º 40
0
def computeRLEFactors(counts):
    """ Compute normalization size factors
    using the RLE method described in EdgeR and returns then as a vector.
    The edgeR normalization factors are multiplied by the column sums of
    the counts before being returned.
    pandas conversion is activated for the duration of the call and always
    deactivated again, even when the R call fails.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors a vector
    """
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        edger = RimportLibrary("edgeR")
        as_matrix = r["as.matrix"]
        dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
        pandas_sf = pandas2ri.ri2py(dds)
        pandas_cm = pandas2ri.ri2py(r.colSums(counts))
    finally:
        # Do not leave the global conversion switched on after a failure.
        pandas2ri.deactivate()
    return pandas_sf * pandas_cm
Exemplo n.º 41
0
def loadFile(fname, varname=None):
    """
    Load an R data file and return its content converted for pandas.

    fname :  rdata or rds filename to be loaded
    varname : variable name inside rdata; when omitted the file is read
              with readRDS instead of load

    A "date" column, if present, is parsed to datetimes and an "idPolair"
    column, if present, is cast to the category dtype.
    """
    if varname is None:
        # No variable name given: assume the file is in rds format.
        r_object = ro.r['readRDS'](fname)
    else:
        ro.r['load'](fname)
        r_object = ro.r[varname]
    full_data = pandas2ri.ri2py(r_object)

    if "date" in full_data.columns:
        full_data["date"] = pandas.to_datetime(full_data.date)
    if "idPolair" in full_data.columns:
        full_data["idPolair"] = full_data.idPolair.astype("category")
    return full_data
Exemplo n.º 42
0
def computeMnnBatchCorrection(counts):
    """Computes batch correction to a list of batches (data frames)
    where each data frame represents a batch (animal for instance).
    The batch correction is computed using Scran::mnnCorrect()
    from Marioni et al.
    The index and columns of every input frame are re-applied to the
    corresponding corrected frame.
    pandas conversion is activated for the duration of the call and always
    deactivated again, even when the R call fails.
    :param counts: a list of matrices of counts
    :return returns a list of batch corrected matrices of counts
    """
    r_call = """
        function(counts) {
           norm_counts = do.call(mnnCorrect, c(counts, cos.norm.out=FALSE));
           return(lapply(norm_counts$corrected, as.data.frame))
        }
    """
    pandas2ri.activate()
    try:
        as_matrix = r["as.matrix"]
        # Remember the labels so they can be restored after the round-trip.
        meta = [(x.index, x.columns) for x in counts]
        r_counts = [as_matrix(pandas2ri.py2ri(x)) for x in counts]
        RimportLibrary("scran")
        r_func = r(r_call)
        norm_counts = []
        for i, x in enumerate(r_func(r_counts)):
            norm_c = pandas2ri.ri2py(x)
            norm_c.index = meta[i][0]
            norm_c.columns = meta[i][1]
            norm_counts.append(norm_c)
    finally:
        # Do not leave the global conversion switched on after a failure.
        pandas2ri.deactivate()
    return norm_counts
Exemplo n.º 43
0
def computeSumFactors(counts, scran_clusters=True):
    """ Compute normalization factors
    using the deconvolution method
    described in Marioni et al.
    Returns the computed size factors as a vector.
    pandas conversion is activated for the duration of the call and always
    deactivated again, even when the R call fails.
    :param counts: a matrix of counts (genes as rows)
    :param scran_clusters: when True and there are at least 50 columns,
                           columns are first grouped with scran's
                           quickCluster and the clusters are passed on
    :return returns the normalization factors a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        scran = RimportLibrary("scran")
        as_matrix = r["as.matrix"]
        if scran_clusters and n_cells >= 50:
            r_clusters = scran.quickCluster(as_matrix(r_counts),
                                            min(n_cells/10, 10),
                                            method="igraph")
            # Pool sizes are derived from the smallest cluster.
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list(range(min(int(min_cluster_size/4), 10),
                               min(int(min_cluster_size/2), 50), 5))
            dds = scran.computeSumFactors(as_matrix(r_counts),
                                          clusters=r_clusters, sizes=sizes)
        else:
            sizes = list(range(min(int(n_cells/4), 10),
                               min(int(n_cells/2), 50), 5))
            dds = scran.computeSumFactors(as_matrix(r_counts), sizes=sizes)
        pandas_sf = pandas2ri.ri2py(dds)
    finally:
        # Do not leave the global conversion switched on after a failure.
        pandas2ri.deactivate()
    return pandas_sf
Exemplo n.º 44
0
def get_rdata(url):
    """Download an RData file, load it into R and return its tables as rows.

    The payload is saved under ``"rdata"`` plus the URL with the
    ``http://data.war-on-ice.net`` / ``http://war-on-ice.com`` prefixes
    stripped, then loaded with R's ``load``.

    :param url: location of the RData file
    :return: dict mapping each loaded R object name to a list of dicts,
        one dict per row, keyed by column label
    """
    # For testing, probably want to do this a different way in production TODO
    local_path = "rdata" + url.replace("http://data.war-on-ice.net", "").replace("http://war-on-ice.com", "")

    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()

    # Written in binary mode so the bytes are stored untouched on every
    # platform; the handle is closed even if the write fails.
    with open(local_path, "wb") as fp:
        fp.write(payload)

    robj = r.load(local_path)
    rdata = {}
    for sets in robj:
        myRData = pandas2ri.ri2py(r[sets])
        # convert to DataFrame
        if not isinstance(myRData, pd.DataFrame):
            myRData = pd.DataFrame(myRData)
        rows = []
        for element in myRData:
            for counter, value in enumerate(myRData[element]):
                # Grow the row list lazily while walking the first column.
                if counter >= len(rows):
                    rows.append({})
                rows[counter][element] = value
        rdata[sets] = rows
    return rdata
Exemplo n.º 45
0
Arquivo: limma.py Projeto: mfiers/rat
def run_simple(A, B):
    """Two-group limma/voom differential expression between ``A`` and ``B``.

    The column-wise concatenation of ``A`` and ``B`` is used as the count
    table; columns from ``A`` form group 0 and columns from ``B`` group 1.
    The design is ``model.matrix(~exp)`` and the reported coefficient is
    the last design column.

    :param A: DataFrame of counts for the first group
    :param B: DataFrame of counts for the second group
    :return: DataFrame of the topTable output with columns 0, 3 and 4
        renamed to 'lfc', 'pval' and 'padj', plus 'slp': log10 of the
        p-value, negated where 'lfc' is positive
    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    r = ro.r

    pandas2ri.activate()

    # Imported for their side effect only; presumably this makes the
    # package functions reachable through `r` below.
    importr('limma')
    importr('edgeR')

    # `axis` must be passed by keyword: it is keyword-only in pandas 2.
    counts = pd.concat([A, B], axis=1)
    groups = r.factor(r.c(*([0] * A.shape[1] + [1] * B.shape[1])))
    ro.globalenv['exp'] = groups

    design = r('model.matrix(~exp)')
    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)
    tt = r.topTable(fit, coef=r.ncol(design), number=1e12)
    ttidx = r['row.names'](tt)
    tt = pandas2ri.ri2py(tt)

    # Rename by position on a plain list; integer keys on a string-indexed
    # Series are not reliably positional across pandas versions.
    new_cols = list(tt.columns)
    new_cols[0] = 'lfc'
    new_cols[3] = 'pval'
    new_cols[4] = 'padj'
    tt.columns = new_cols

    tt['slp'] = np.log10(tt['pval'])
    tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(tt.loc[tt['lfc'] > 0, 'pval'])
    tt.index = ttidx
    return tt
Exemplo n.º 46
0
    def variogram(self, i=0, plot_v=True, **kwargs):
        """
        Generate a variogram

        Parameters
        ----------
        self : Event object with at least one data column
        i : int data column index number (defaults to 0)
        plot_v : bool generate a plot of the variogram

        **kwargs (target_np, alpha, tol_hor, max_bnd, last_max)
            forwarded unchanged to the R helper ``get_iSVG``

        Returns
        -------
        v : Dataframe containing output from r-variogram function
        """
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        rfuncs = import_r_tools()

        # Make sure the X/Y columns exist before selecting them.
        if 'X' not in self.ll_cols:
            self.set_ll()

        value_col = self.data_cols[i]
        # Keep only rows where X, Y and the data value are all present.
        points = self.df.loc[:, ['X', 'Y', value_col]].dropna(how='any')
        v = pandas2ri.ri2py(rfuncs.get_iSVG(points, 3, **kwargs))
        if not plot_v:
            return v
        v.plot(x='dist', y='gamma', marker = 'o', figsize=(8,4))
        return v
Exemplo n.º 47
0
def find_extrema(se, window=5, span_points=25):
    """Smooth a series with R's loess and locate its local extrema.

    :param se: pandas Series; its index is used as x and its values as y
    :param window: size of the centred rolling window used to find
        local maxima and minima of the smoothed curve
    :param span_points: number of points the loess span should cover; it
        is divided by the series length to obtain the span
    :return: tuple ``(fitted, lows, highs)``: the smoothed Series and the
        rows (with columns x, y, fitted, max, min) where the smoothed
        value equals the rolling minimum / rolling maximum
    """
    df = pd.DataFrame({'x': se.index, 'y': se})

    # float() keeps this a true division on Python 2 as well.
    span = span_points / float(len(df))
    lo = stats.loess('y~x', df, span=span, na_action=stats.na_exclude)
    # predict(lo) is used instead of lo.rx2('fitted') because the latter
    # does not include NAs.
    fitted = pd.Series(pandas2ri.ri2py(stats.predict(lo)), index=df.index)

    # pd.rolling_max / pd.rolling_min were removed from pandas; the
    # .rolling() accessor is the equivalent replacement.
    rolling = fitted.rolling(window, center=True)
    max_ = rolling.max()
    min_ = rolling.min()

    df['fitted'] = fitted
    df['max'] = max_
    df['min'] = min_

    # A point is a high (low) when it equals the rolling max (min).
    highs = df[max_ - fitted <= 0]
    lows = df[min_ - fitted >= 0]

    return fitted, lows, highs
def read_rdata(rdata_fullpath, table_name):
    """
    Load an RData file and return its ``model_summary`` object as a
    pandas DataFrame with all NA values replaced by 0.

    rdata_fullpath: path of the RData file; backslashes are converted to
                    forward slashes before the path is handed to R.
    table_name:     not used by this function; the object read is always
                    the one named ``model_summary``.

    Progress and per-column NA counts are printed to stdout.
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    print "Loading %s" % rdata_fullpath_forR

    # load the file into the R session
    r['load'](rdata_fullpath_forR)
    # fetch the loaded object by its fixed name and convert it to pandas
    table_df = pandas2ri.ri2py(r['model_summary'])

    # report how many NA values each column holds before filling them
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0: print "  Found %5d NA values in column %s" % (nullcount, col)
    table_df = table_df.fillna(0)
    # check again after fillna; anything reported here was not filled
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0: print "  -> Found %5d NA values in column %s" % (nullcount, col)

    print "Read %d lines from %s" % (len(table_df), rdata_fullpath)
    return table_df
Exemplo n.º 49
0
def mca( distance_matrix, dim = 2 ):
    """ calculate MCA matrix using R's FactoMineR

    Every column of ``distance_matrix.H`` is turned into an R factor and
    the resulting data frame is passed to ``MCA(..., graph=FALSE)``.
    The R session is acquired for the duration of the call and always
    released again, even when an R call fails.

    :param distance_matrix: object whose ``H`` attribute is the haplotype
        DataFrame
    :param dim: accepted but not used by this function
    :return: tuple ``(coord, None)`` where ``coord`` is the converted
        ``ind$coord`` element of the MCA result
    """

    from fatools.lib.utils import acquire_R, release_R
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri

    acquire_R()
    try:
        # build up haplotype dataframe
        r_df = pandas2ri.py2ri(distance_matrix.H)
        robjects.globalenv['haplo_data'] = r_df
        marker_len = len(distance_matrix.H.columns)
        # R columns are 1-based, hence range(1, marker_len + 1).
        arguments = ','.join('as.factor(haplo_data[,%d])' % x
                        for x in range(1, marker_len + 1))
        robjects.r('haplo_df <- data.frame(%s)' % arguments)
        robjects.r('library(FactoMineR)')
        mca_res = robjects.r('MCA(haplo_df, graph=FALSE)')

        # get the individual coordinate
        coord = pandas2ri.ri2py(mca_res.rx('ind')[0].rx('coord')[0])
    finally:
        release_R()

    return (coord, None)
Exemplo n.º 50
0
def logCountsWithFactors(counts, size_factors):
    """ Log-normalize a matrix of counts (genes as rows) with a vector of
    size factors, using R's SingleCellExperiment / normalize() /
    logcounts() after loading the scran library.
    The original index and columns are re-applied to the result.
    pandas conversion is activated for the duration of the call and always
    deactivated again, even when the R call fails.
    :param counts: a matrix of counts (genes as rows)
    :param size_factors: a vector of size factors
    :return the normalized log counts (genes as rows)
    """
    r_call = """
        function(counts, size_factors){
          sce = SingleCellExperiment(assays=list(counts=as.matrix(counts)))
          sizeFactors(sce) = size_factors
          sce = normalize(sce)
          norm_counts = logcounts(sce)
          return(as.data.frame(norm_counts))
        }
    """
    columns = counts.columns
    indexes = counts.index
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        # Called for its side effect; the returned handle is not needed.
        RimportLibrary("scran")
        r_func = r(r_call)
        r_norm_counts = r_func(r_counts, size_factors)
        pandas_norm_counts = pandas2ri.ri2py(r_norm_counts)
        pandas_norm_counts.index = indexes
        pandas_norm_counts.columns = columns
    finally:
        # Do not leave the global conversion switched on after a failure.
        pandas2ri.deactivate()
    return pandas_norm_counts
Exemplo n.º 51
0
def getEvalresp(network=None, station=None, location=None, channel=None,
                time=None, minfreq=None, maxfreq=None,
                nfreq=None, units=None, output="fap"):
    """
    Returns a pandas dataframe with instrument response data from the IRIS DMC evalresp webservice.
    :param network: sncl network (string)
    :param station: sncl station (string)
    :param location: sncl location (string)
    :param channel: sncl channel (string)
    :param time: ObsPy UTCDateTime object specifying the time at which the response is evaluated.
    :param minfreq: Optional minimum frequency at which the response is evaluated.
    :param maxfreq: Optional maximum frequency at which the response is evaluated.
    :param nfreq: Optional number of frequencies at which response will be evaluated.
    :param units: Optional code specifying unit conversion.
    :param output: Output type ['fap'|'cs'].
    :return: pandas dataframe of response metadata.
    """
    iris_client = robjects.r('new("IrisClient")')

    # Translate the python arguments into their R counterparts
    r_time = R_POSIXct(time)
    (r_minfreq, r_maxfreq, r_nfreq, r_units, r_output) = _R_args(
        minfreq, maxfreq, nfreq, units, output)

    # Query the web service and hand back the converted result
    r_response = _R_getEvalresp(iris_client, network, station, location, channel,
                                r_time, r_minfreq, r_maxfreq, r_nfreq, r_units, r_output)
    return pandas2ri.ri2py(r_response)
def read_r_to_python(path_I, path_II):
    """Read two RDS files and convert them into Python objects.

    The object in ``path_I`` becomes a pandas data frame whose unicode
    cells are reduced to ASCII. The object in ``path_II`` is converted,
    its ``np.unicode_`` entries are reduced to ASCII, and the result is
    cast to ``np.string_``.

    This method is DEPRECATED in favour of class <R_Python_Unilever>.
    """
    from rpy2.robjects import r
    from rpy2.robjects import pandas2ri
    import unicodedata

    def _to_ascii(text):
        # NFKD-normalise, then drop every character that has no ASCII form.
        return unicodedata.normalize('NFKD', text).encode('ascii','ignore')

    # First file: data frame
    df = pandas2ri.ri2py(r.readRDS(path_I))
    df = df.applymap(lambda x: _to_ascii(x) if type(x) == unicode else x)

    # Second file: descriptions
    desc = pandas2ri.ri2py(r.readRDS(path_II))
    for j, _ in enumerate(desc):
        if type(desc[j]) == np.unicode_:
            # http://stackoverflow.com/questions/1207457/convert-a-unicode-string-to-a-string-in-python-containing-extra-symbols
            desc[j] = str(_to_ascii(desc[j]))
    desc = desc.astype(np.string_)
    return df, desc
Exemplo n.º 53
0
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression

    data_frame: counts table; each column name is split on "." and its
                parts are read as condition (0), time (1) and replicate (2)
    header:     label used in the log message and in the plot file names
    alpha:      significance level passed to plotMA
    res_dir:    directory the dispersion and MA plots are written to

    Returns the R results table (ordered by padj, NAs last) converted
    with pandas2ri.ri2py.
    '''

    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns

    # py2ri requires activation
    pandas2ri.activate()
    counts = pandas2ri.py2ri(data_frame)

    # design variables are parsed out of the column names
    des_times = ro.IntVector([x.split(".")[1] for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector([x for x in data_frame.index])

    # setup counts table and design frame
    # (R output is redirected to /dev/null until the sink is reset below)

    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    R('''times <- as.factor(%s)''' % des_times.r_repr())
    R('''reps <- c(%s)''' % des_reps.r_repr())
    R('''condition <- c(%s)''' % des_cond.r_repr())
    R('''design <- data.frame(times, reps, condition)''')
    R('''counts <- data.frame(%s)''' % counts.r_repr())
    R('''genes <- c(%s)''' % genes.r_repr())
    R('''rownames(counts) <- genes''')
    R('''rownames(design) <- colnames(counts)''')

    # use DESeq() with LRT and reduced formula.  Use effect
    # size moderation

    R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
      '''colData=design, '''
      '''design=~reps + times + condition + times:condition)''')
    R('''dds <- DESeq(dds, test="LRT", '''
      '''reduced=~reps + times + condition, betaPrior=T)''')
    R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
    R('''res.df <- data.frame(res)''')

    # generate dispersion and MA plots
    R('''png("%s/%s-dispersions.png")''' % (res_dir,
                                            header))
    R('''plotDispEsts(dds)''')
    R('''dev.off()''')

    R('''png("%s/%s-MAplot.png")''' % (res_dir,
                                       header))
    R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
    R('''dev.off()''')
    # restore normal R output
    R('''sink(file=NULL)''')

    df = pandas2ri.ri2py(R['res.df'])

    return df
Exemplo n.º 54
0
def treeCutting(infile,
                expression_file,
                cluster_file,
                cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix

    infile:            tab-separated distance matrix with a header row and
                       gene ids in the first column; missing values are
                       replaced by 0.0
    expression_file:   not used by this function
    cluster_file:      path the cluster colour labels are written to
    cluster_algorithm: linkage method passed to flashClust
    deepsplit:         selects deepSplit=T or deepSplit=F for cutreeDynamic

    Returns a DataFrame indexed by gene_id with a single 'cluster' column.
    '''
    # R output is redirected here while the clustering runs
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t",
                       header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)

    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())
    # the two branches differ only in the deepSplit flag
    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    # convert numeric cluster labels to colour names and save them
    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    # pair every gene id with its cluster colour
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    # restore normal R output
    R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    # index by gene id and drop the now-redundant column
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
Exemplo n.º 55
0
def pandas_load(name):
    '''
    Load an .rdata file into the R session and return one object from it
    as a Pandas dataframe.

    The object returned is the first name listed by R's ls() after the
    load, so it is only guaranteed to come from this file when the R
    environment holds nothing else that is listed earlier.

    :param name: .rdata filename (eg: 'subset.Rdata')
    :return: pandas dataframe object
    '''
    pandas2ri.activate()
    r.load(name)
    # ls() lists the active objects in the R environment
    first_object = r.ls()[0]
    return pandas2ri.ri2py(r[first_object])
Exemplo n.º 56
0
 def clust_read(self):
     """Extract silhouette information from ``self.clust_obj``.

     Stores the converted ``clus.avg.widths`` element in ``self.cl_width``,
     the converted ``avg.width`` element in ``self.avg_wid`` and, in
     ``self.silinfo``, a frame listing the unique ``win_id`` values of
     every cluster found in the ``widths`` element.
     """
     # Fetch R's `$` accessor from the base package's attribute dictionary
     r_dollar = importr('base').__dict__['$']
     widths = r_dollar(self.clust_obj, 'widths')
     cluster_avg_widths = r_dollar(self.clust_obj, 'clus.avg.widths')
     overall_avg_width = r_dollar(self.clust_obj, 'avg.width')

     # Convert to pandas objects
     self.cl_width = pandas2ri.ri2py(cluster_avg_widths)
     self.avg_wid = pandas2ri.ri2py(overall_avg_width)
     # pandas2ri.ri2py_dataframe(widths) is a possible alternative to switch to
     widths_table = com.convert_robj(widths)

     # Move the row labels into a 'win_id' column, keeping 'cluster' as a column
     by_cluster = widths_table.reset_index().set_index('cluster')
     by_cluster = by_cluster.rename(columns={'index': 'win_id'})
     flat = by_cluster.reset_index()

     # One row per cluster holding the unique win_id values
     unique_ids = flat.groupby(('cluster'))['win_id'].unique()
     self.silinfo = unique_ids.to_frame()
Exemplo n.º 57
0
Arquivo: limma.py Projeto: mfiers/rat
def run2(counts, formula, normcounts = None):
    """limma/voom differential expression for every non-first design column.

    The design matrix is built in R from ``formula`` using the column
    index levels of ``counts`` as covariates. One topTable is produced per
    design column except the first, and the tables are concatenated
    column-wise.

    :param counts: DataFrame of counts whose columns carry named index
        levels used as design covariates
    :param formula: R model formula as a string
    :param normcounts: optional path; when given, the voom object is
        written there with ``write.table``
    :return: DataFrame with, per coefficient, the topTable columns
        (0, 3 and 4 renamed to 'lfc', 'pval', 'padj') plus 'slp'; column
        names are prefixed with the design column name when the design
        has more than two columns
    """

    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    r = ro.r

    pandas2ri.activate()

    # Imported for their side effect only; presumably this makes the
    # package functions reachable through `r` below.
    importr('limma')
    importr('edgeR')

    design_matrix = counts.T.reset_index()[counts.columns.names]
    ro.globalenv['design.matrix'] = design_matrix
    design = r('as.data.frame(model.matrix(' + formula + ', data=design.matrix))')

    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    ro.globalenv['v'] = v
    if normcounts is not None:
        r('write.table(v, "' + normcounts + '",sep="\t",quote = F,col.names = NA)')

    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)

    rv = []

    n_design_cols = r.ncol(design)[0]
    design_colnames = r.colnames(design)
    print(n_design_cols)
    for i in range(1, n_design_cols):
        colname = design_colnames[i]
        # `i` is a 0-based Python position but topTable's `coef` is a
        # 1-based R index, so i + 1 selects the coefficient that `colname`
        # names. Passing `i` tested the previous column instead.
        tt = r.topTable(fit, coef=i + 1, number=1e12)
        ttidx = r['row.names'](tt)
        tt = pandas2ri.ri2py(tt)

        # Rename by position on a plain list; integer keys on a
        # string-indexed Series are not reliably positional across pandas
        # versions.
        new_cols = list(tt.columns)
        new_cols[0] = 'lfc'
        new_cols[3] = 'pval'
        new_cols[4] = 'padj'
        tt.columns = new_cols

        tt['slp'] = np.log10(tt['pval'])
        tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(tt.loc[tt['lfc'] > 0, 'pval'])
        if n_design_cols > 2:
            # prepend colname to columns - only if there are more factors
            tt.columns = ['{}_{}'.format(colname, x) for x in tt.columns]
        tt.index = ttidx

        rv.append(tt)
    return pd.concat(rv, axis=1)
Exemplo n.º 58
0
    def transform(self, method="vst", inplace=True):
        '''
        Transform the counts table with DESeq2.

        Supported methods:
         - "vst": varianceStabilizingTransformation
         - "rlog": rlog

        With inplace=True the transformed table replaces self.table and
        nothing is returned; otherwise a clone carrying the transformed
        table is returned. In both cases "." in column names is turned
        back into "-".
        '''

        assert method in ["vst", "rlog"], ("method must be one of"
                                           "[vst, rlog]")

        r_function_names = {"vst": "varianceStabilizingTransformation",
                            "rlog": "rlog"}

        r_transform = R('''
        function(df){

        suppressMessages(library('DESeq2'))

        design = data.frame(row.names = colnames(df),
                            condition = seq(1, length(colnames(df))))

        dds <- suppressMessages(DESeqDataSetFromMatrix(
                 countData= df, colData = design, design = ~condition))

        transformed <- suppressMessages(%(t_function)s(dds))
        transformed_df <- as.data.frame(assay(transformed))

        return(transformed_df)
        }''' % {"t_function": r_function_names[method]})

        transformed = pandas2ri.ri2py(r_transform(pandas2ri.py2ri(self.table)))
        # The row names do not survive the conversion, so restore them.
        transformed.index = self.table.index

        if not inplace:
            result = self.clone()
            result.table = transformed
            # R replaces "-" in column names with ".". Revert back!
            result.table.columns = [name.replace(".", "-")
                                    for name in result.table.columns]
            return result

        self.table = transformed
        # R replaces "-" in column names with ".". Revert back!
        self.table.columns = [name.replace(".", "-")
                              for name in self.table.columns]
Exemplo n.º 59
0
def computeSizeFactors(counts):
    """ Computes size factors using DESeq
    for the counts matrix given as input (Genes as rows
    and spots as columns).
    Returns the computed size factors as a vector.
    pandas conversion is activated for the duration of the call and always
    deactivated again, even when the R call fails.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors a vector
    """
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        deseq2 = RimportLibrary("DESeq2")
        dds = deseq2.estimateSizeFactorsForMatrix(r_counts)
        pandas_sf = pandas2ri.ri2py(dds)
    finally:
        # Do not leave the global conversion switched on after a failure.
        pandas2ri.deactivate()
    return pandas_sf
Exemplo n.º 60
0
def getTraveltime(latitude, longitude, depth, staLatitude, staLongitude):
    """
    Returns a pandas dataframe with seismic traveltime data from the IRIS DMC traveltime web service.
    :param latitude: Latitude of seismic event.
    :param longitude: Longitude of seismic event.
    :param depth: Depth of seismic event.
    :param staLatitude: Latitude of seismic station.
    :param staLongitude: Longitude of seismic station.
    :return: pandas dataframe with columns: ``distance, depth, phaseName, travelTime, rayParam, takeoff, incident, puristDistance, puristName``.
    """
    iris_client = robjects.r('new("IrisClient")')

    # Query the web service and hand back the converted result
    r_traveltimes = _R_getTraveltime(iris_client, latitude, longitude, depth,
                                     staLatitude, staLongitude)
    return pandas2ri.ri2py(r_traveltimes)