def mc(self):
    """Return the R object named 'mc' as a Struct keyed by column name.

    Fetches the R object, asks R for its column names, and transposes
    the array form so that row i of the transposed data lines up with
    column name i.

    Returns:
        Struct: mapping of R column name -> column data.
    """
    r_obj = r['mc']
    colnames = r.colnames(r_obj)
    # Transpose so obj[i] holds the data for colnames[i].
    obj = array(r_obj).T
    # NOTE: a leftover debug print of the column names was removed here;
    # sibling accessors (model, parstat) do not print.
    return Struct(dict(zip(colnames, obj)))
def model(self):
    """Return the R object named 'model' as a Struct of columns.

    R NA values (NARealType instances) are replaced with Python None
    so downstream code sees ordinary missing values.

    Returns:
        Struct: mapping of R column name -> cleaned column list.
    """
    r_obj = r['model']
    names = r.colnames(r_obj)
    # Replace R's NA sentinel with None, column by column.
    table = [
        [None if isinstance(value, NARealType) else value for value in column]
        for column in r_obj
    ]
    return Struct(dict(zip(names, table)))
def parstat(self):
    """Return the R object 'ParStat' as a Struct mapping each parameter
    name to a dict of its statistics.

    Returns:
        Struct: parameter name -> {statistic name: value}.
    """
    r_obj = r['ParStat']
    # BUGFIX: query column names on the original R object. The previous
    # code called r.colnames() on the already-transposed numpy array,
    # which R cannot resolve correctly; the sibling parstat()
    # implementation in this file queries the R object directly.
    colnames = r.colnames(r_obj)
    # Transpose so each row corresponds to one parameter (column of the
    # R object).
    obj = array(r_obj).T
    properties = [
        "xopt", "x1per", "x99per", "x10per", "x90per", "xminus",
        "xplus", "mode", "mean", "sd", "skewness", "kurtosis",
    ]
    return Struct({
        name: dict(zip(properties, row))
        for name, row in zip(colnames, obj)
    })
def run(self, data_object):
    """Read canned dataset from R to a pandas dataframe.

    Args:
        data_object (DataObject): container the resulting DataFrame is
            added to.

    Returns:
        data_object (DataObject): DataObject instance
        terminate (bool): should we terminate the DAG? true or false
            (True when the resulting DataFrame is empty)

    Raises:
        ImportError: if rpy2 is not installed.
    """
    dataset = self.node_config["dataset"]
    logging.info("Reading {} from R".format(dataset))
    try:
        from rpy2.robjects.packages import importr, data
    except ImportError:  # pragma: no cover
        raise ImportError(
            "This example needs Rpy2. "
            "Please refer to the R requirements in the README"
        )
    datasets = importr("datasets")
    # Load the named dataset into the R environment so the R expression
    # below can resolve it by name.
    r_env = data(datasets).fetch(dataset)

    import rpy2.robjects as robjects

    # why we do this:
    # > data(euro)
    # > euro
    #       ATS       BEF       DEM       ESP  ...
    # 13.760300 40.339900  1.955830 166.386000 ...
    #
    # > as.data.frame(euro)
    #          euro
    # ATS 13.760300
    # BEF 40.339900
    # DEM  1.955830
    #
    # BUGFIX: this local was previously named `data`, which shadowed and
    # clobbered the `data` function imported from rpy2.robjects.packages
    # above; renamed to r_df.
    r_df = robjects.r("as.data.frame(%s)" % dataset)

    # at time of writing, rpy2's R dataframe to pandas dataframe was not
    # fully supported. However, python list() seems to work for
    # FloatVector, StrVector, and FactorVector, so convert each column
    # of the R dataframe in turn.
    from rpy2.robjects import r

    colnames = r.colnames(r_df)
    pandas_data = {}
    for i, colname in enumerate(colnames):
        pandas_data[colname] = list(r_df[i])

    # Unfortunately, some datasets have rownames that should be an ID
    # column (e.g., see mtcars where rownames=names of the cars). This is
    # the best we can do: pull it out as an additional column for each
    # and every dataset.
    pandas_data["row_names"] = list(r_df.rownames)

    df = pd.DataFrame(pandas_data)
    data_object.add(self, df)
    terminate = df.empty
    return data_object, terminate
def _parse_assayData(assayData, assay):
    """Parse Rpy2 assayData (Environment object).

    assayData: Rpy2 Environment object.
    assay: An assay name indicating the data to be loaded.

    Return a parsed expression dataframe (Pandas).
    """
    pandas2ri.activate()
    # Pull the expression matrix for the requested assay out of the R
    # environment, then convert the matrix plus its row/column names to
    # Python objects.
    matrix = assayData[assay]
    values = pandas2ri.ri2py(matrix)
    feature_labels = pandas2ri.ri2py(r.rownames(matrix))
    sample_labels = pandas2ri.ri2py(r.colnames(matrix))
    return pd.DataFrame(values, index=feature_labels, columns=sample_labels)
def parstat(self):
    """Build a tabular view of the R 'ParStat' object.

    Returns a list of rows: first a header row ("" followed by the
    statistic names), then one row per parameter holding the parameter
    name followed by its statistic values in header order.
    """
    r_obj = r['ParStat']
    obj = array(r_obj).T
    colnames = r.colnames(r_obj)
    properties = [
        "xopt", "x1per", "x99per", "x10per", "x90per", "xminus",
        "xplus", "mode", "mean", "sd", "skewness", "kurtosis",
    ]
    # Header row first, then one row per parameter.
    matrix = [[""] + properties]
    for idx, param in enumerate(colnames):
        stats = dict(zip(properties, obj[idx]))
        matrix.append([param] + [stats[prop] for prop in properties])
    return matrix
# Extract required arguments. data = pd.read_table(snakemake.input.data, index_col=0) # Input Gene-by-Sample raw count data. condition = pd.read_table(snakemake.input.condition, index_col=0, names=['condition']) # Input condition file which indicates to which condition each sample belongs. logger.info('%d(genes) x %d(samples) data matrix and %d sample conditions are given.' % (data.shape[0], data.shape[1], len(condition.index))) logger.debug('Headers: %s...' % ' '.join(data.columns[:3])) logger.debug('Gene identifiers: %s...' % ' '.join(data.index[:3])) intersecting_samples = [sample for sample in data.columns if sample in condition.index] data = data[intersecting_samples] condition = list(condition.loc[intersecting_samples].condition.values) logger.info('%d samples will be used for DEG discovery.' % len(intersecting_samples)) r_data_matrix = r['data.matrix'](pandas2ri.py2ri(data)) r_samples = r.colnames(r_data_matrix) r_conditions = ro.FactorVector(condition) logger.debug('Computing size factors.') r_size_factors = ebseq.MedianNorm(r_data_matrix) logger.info('Discovering DEGs.') logger.info('Running EBTest.') num_iteration = 0 while True: # Increase iteration numbers if the conditons are not met. # Hopefully most of the tie, 10 iterations will be enough for convergence. num_iteration += 10 r_eb_out = ebseq.EBTest(Data=r_data_matrix, Conditions=r_conditions, sizeFactors=r_size_factors, maxround=num_iteration) logger.info('Running GetDEResults. (FDR cutoff = %.3f)' % cutoff)