示例#1
0
def createGraphSeries(cohort):
   """Write the correlation and per-HMM score graphs for a cohort to a PDF.

   Reads ``<cohort>.seq.out`` (tab separated) for the scatter plots and one
   ``<cohort>.seq.<hmm>.stats`` file (tab separated) per HMM name for the
   cutoff/ROC plots.  Every graph is written to ``<cohort>.stats.pdf``.

   Args:
      cohort: Cohort name.  Used as the prefix of all input/output file
         names and, upper-cased, in the graph titles.
   """
   hm = ['bothR5','bothDX','esR5','esDX','trofileR5','trofileDX','mixedR5DX']
   # (x column, y column) pairs drawn as scatter plots, in output order
   scatter_pairs = [
      ('bothR5','bothDX'),
      ('esR5','esDX'),
      ('trofileR5','trofileDX'),
      ('bothR5','esR5'),
      ('trofileR5','esR5'),
      ('bothDX','esDX'),
      ('trofileDX','esDX'),
      ('bothDX','mixedR5DX'),
   ]
   stat_data = DataFrame.from_csvfile(cohort+'.seq.out', sep = "\t")

   # Begin writing graphs to pdf
   grdevices.pdf(file=cohort+".stats.pdf",width=7,height=7)
   try:
      # graph 1 scatter plot
      for x_col, y_col in scatter_pairs:
         scatterPlot(stat_data,x_col,y_col,cohort.upper()+' cohort\n '+x_col+'/'+y_col+' correlation')

      for hmm in hm:
         hmm_scores = DataFrame.from_csvfile(cohort+'.seq.'+hmm+'.stats',sep="\t")
         # graph 3
         plot(hmm_scores.rx2("Cutoff"),hmm_scores.rx2("Accuracy"),type='o',main='Accuracy vs. Cutoff '+cohort.upper()+' Cohort\n'+hmm+'.hmm',xlab='Cutoff',ylab='Accuracy')
         # graph 4
         # NOTE(review): the y axis plots the "TPP" column but is labelled
         # '% False Neg' -- confirm which of the two is intended.
         plot(hmm_scores.rx2("FPP"),hmm_scores.rx2("TPP"),type='o',xlim=base.c(0,1),ylim=base.c(0,1),main=cohort.upper()+' Cohort ROC\n'+hmm+'.hmm',xlab='% False Pos',ylab='% False Neg')
         # graph 5
         plot(hmm_scores.rx2("Cutoff"),hmm_scores.rx2("Phi"),type='o',xlim=base.c(50,100),ylim=base.c(0,0.75),main=cohort.upper()+' Cohort Association Coeff\n'+hmm+'.hmm',xlab='Cutoff',ylab='Association Coefficient')
   finally:
      # close pdf file, even when a plot or a stats-file read fails,
      # so the graphics device is not left open
      grdevices.dev_off()
   return
示例#2
0
def r_c50(rdf: RDataFrame, target: str, predictors: List[str]) -> RListVector:
    """
    Fit a C5.0 classifier on columns of an R dataframe.

    The predictor columns are selected by name with ``rdf.rx`` and the target
    column with ``rdf.rx2``; both are handed to ``C50.C5_0`` and its result is
    returned unchanged.

    Note: The target column must be a factor vector.
    TODO: Training control and other parameters.
    """
    features = rdf.rx(r_c(*predictors))
    labels = rdf.rx2(r_c(target))
    model = C50.C5_0(features, labels)
    return model
示例#3
0
文件: cimpl.py 项目: rajithbt/pyim
    def _extract_mapping(self, cimpl_obj, cis_sites):
        """Map each insertion id to the set of CIS ids it belongs to.

        The CIS sites are converted to an R data frame, passed together with
        ``cimpl_obj`` to ``getCISMatrix`` and the resulting matrix is melted
        into (insertion, cis) pairs.

        Args:
            cimpl_obj: Object handed through to ``self._cimpl.getCISMatrix``.
            cis_sites: CIS sites, converted with ``CisSite.to_frame``.

        Returns:
            dict: ``{insertion_id: set of cis_id}``.
        """
        # Convert CIS sites to frame format.
        cis_frame = CisSite.to_frame(cis_sites)

        # Convert to R representation for cimpl.
        chr_with_prefix = add_prefix(cis_frame['chromosome'], prefix='chr')

        r_base = importr('base')
        cis_frame_r = RDataFrame({
            'id':
            r_base.I(StrVector(cis_frame['id'])),
            'chromosome':
            r_base.I(StrVector(chr_with_prefix)),
            'scale':
            StrVector(cis_frame['scale']),
            'start':
            IntVector(cis_frame['start']),
            'end':
            IntVector(cis_frame['end'])
        })
        cis_frame_r.rownames = StrVector(cis_frame['id'])

        # Retrieve cis matrix from cimpl.
        cis_matrix_r = self._cimpl.getCISMatrix(cimpl_obj, cis_frame_r)
        cis_matrix = dataframe_to_pandas(cis_matrix_r)

        # Extract scale information from cis matrix.
        scale_cols = [c for c in cis_matrix.columns if c.startswith('X')]
        cis_matrix_scales = cis_matrix[['id'] + scale_cols]

        # Melt matrix into long format.
        mapping = pd.melt(cis_matrix_scales, id_vars=['id'])
        mapping = mapping[['id', 'value']]
        mapping = mapping.rename(columns={
            'id': 'insertion_id',
            'value': 'cis_id'
        })

        # Drop empty rows first (these are empty cells in the matrix), then
        # split the cis_id column into individual entries for cells that
        # hold multiple '|'-separated ids.
        # `.loc` replaces the `.ix` indexer, which newer pandas no longer has.
        mapping = mapping.loc[mapping['cis_id'] != '']
        mapping = expand_column(mapping, col='cis_id', delimiter='|')

        mapping_dict = {
            ins_id: set(grp['cis_id'])
            for ins_id, grp in mapping.groupby('insertion_id')
        }

        return mapping_dict
示例#4
0
def clusterPop(admix, indexes):
    """Reorder the selected admixture rows by hierarchical clustering.

    The lines of ``admix`` whose 0-based position is in ``indexes`` are
    copied to a scratch file named ``adm.<pid>``, loaded as an R data frame
    and clustered with R's ``hclust(dist(...))``.

    Args:
        admix: Path of the space-separated admixture file.
        indexes: 0-based line positions to cluster.

    Returns:
        list: The selected line positions, arranged according to the
        ``order`` component of the hclust result.  A single index is
        returned as-is without touching the file or R.
    """
    if len(indexes)==1:
        return [indexes[0]]
    wanted = set(indexes)  # O(1) membership test per input line
    subf = "adm.%d" % (os.getpid(),)
    oldOrder = []
    try:
        with open(subf, "w") as w:
            with open(admix) as f:
                for fPos, l in enumerate(f):
                    if fPos in wanted:
                        w.write(l)
                        oldOrder.append(fPos)
        df = DataFrame.from_csvfile(subf, sep=" ", header=False)
        d=robjects.r.hclust(robjects.r.dist(df))
    finally:
        # remove the scratch file even when reading or clustering fails
        if os.path.exists(subf):
            os.remove(subf)
    for name, value in d.items():
        if name=="order":
            order = value
            break
    # `order` holds 1-based row numbers, hence the -1
    return [oldOrder[pos-1] for pos in order]
示例#5
0
def clusterAll(admix, myPop):
    """Order population names by clustering their mean admixture rows.

    Every line of ``admix`` is a space-separated row of floats; line ``i``
    belongs to population ``myPop[i]``.  The rows are averaged per
    population, written (in sorted population-name order) to a scratch file
    named ``accu.<pid>`` and clustered in R with
    ``hclust(dist(df, method="max"), method="complete")``.

    Args:
        admix: Path of the admixture file.
        myPop: Sequence giving the population name of each line.

    Returns:
        list: Population names arranged according to the ``order``
        component of the hclust result.
    """
    with open(admix) as f:
        ls = f.readlines()
    numK = len(ls[0].split(" "))
    # population name -> (number of rows, per-column running sums)
    pops = {}
    for i, line in enumerate(ls):
        vals = [float(x) for x in line.rstrip().split(" ")]
        cnt, accu = pops.get(myPop[i], (0, [0.0]*numK))
        cnt += 1
        for i2 in range(numK): accu[i2] += vals[i2]
        pops[myPop[i]] = cnt, accu
    popNames = sorted(pops)
    subf = "accu.%d" % (os.getpid(),)
    try:
        with open(subf, "w") as w:
            for popName in popNames:
                cnt, sums = pops[popName]
                w.write(" ".join([str(x/cnt) for x in sums]))
                w.write("\n")
        df = DataFrame.from_csvfile(subf, sep=" ", header=False)
        d=robjects.r.hclust(robjects.r.dist(df, method="max"), method="complete")
    finally:
        # remove the scratch file even when reading or clustering fails
        if os.path.exists(subf):
            os.remove(subf)
    for name, value in d.items():
        if name=="order":
            order = value
            break
    # `order` holds 1-based row numbers, hence the -1
    return [popNames[pos-1] for pos in order]
示例#6
0
def createModel(data):
    """Forecast 60 steps for every row of ``data`` and save them to CSV.

    Each row is converted to an R time series (``start=2, frequency=15``),
    fitted with ``auto.arima`` and forecast 60 steps ahead.  The forecasts
    are collected in a pandas frame with columns ``Ret_121_pred`` ..
    ``Ret_180_pred``, printed, and written to ``tmp1.csv``.

    Args:
        data: R data frame; iterated row by row with ``DataFrame.iter_row``.
    """
    print('Create_model')
    importr('forecast')
    robj.r('''
            arima_data <- function(data){

               best_arima = auto.arima(data,trace=F,stepwise=T)
               forecast = forecast.Arima(best_arima,h=60,level=c(99.5))
               output = forecast$mean
               return (output)


            }

   ''')
    features_names = ["Ret_%d_pred" % (i) for i in range(121, 181)]

    # R callables are looked up once instead of on every row
    as_numeric = robj.r('as.numeric')
    make_ts = robj.r('ts')
    arima_data = robj.r('arima_data')

    # Collect the per-row frames and concatenate once at the end; the empty
    # leading frame keeps the result identical to growing it row by row.
    frames = [pd.DataFrame(columns=features_names)]
    for i, tmp in enumerate(DataFrame.iter_row(data), 1):
        if i % 100 == 0:
            print(i)  # progress indicator
        tmp = as_numeric(tmp)
        tmp = make_ts(tmp, start=2, frequency=15)
        forecast = np.array(as_numeric(arima_data(tmp)))
        predict2 = pd.DataFrame(forecast).T
        predict2.columns = features_names
        frames.append(predict2)
    predict = pd.concat(frames, axis=0)
    print(predict)
    predict.to_csv("tmp1.csv")
示例#7
0
    def loadfiles(self):
        """
        Load files into R environment

        Each path in ``self.filelist`` is read with
        ``DataFrame.from_csvfile`` using the matching separator from
        ``self.seplist``.  Loaded frames are appended to ``self.mylist`` and
        the file's base name (directory and extension stripped) to
        ``self.listname``.  An ``IOError`` is added to ``self.error``
        instead of being raised.
        """
        rcount = 0
        names = robjects.r['names']

        # Set the default parameter for reading from csv
        param = {'sep': '\t', 'header': True, 'as_is': True,
                 'row.names': ri.NULL}

        # Override a default only when self.param carries a non-None value
        for p in param:
            if p in self.param and self.param[p] is not None:
                param[p] = self.param[p]

        self.param.update(param)

        # Read all the files in the R-environment
        for f, s in zip(self.filelist, self.seplist):
            try:
                tmpdata = DataFrame.from_csvfile(f,
                                                 sep=str(s),
                                                 header=param['header'],
                                                 as_is=param['as_is'],
                                                 row_names=param['row.names'])
                self.mylist.append(tmpdata)
                fdir, fname = os.path.split(os.path.splitext(f)[0])
                self.listname.append(fname)
                rcount += 1
            except IOError as e:
                # `as` form is valid on Python 2.6+ and Python 3
                self.error += e
示例#8
0
 def hae_ennakkoilmoitukset(tiedosto):
     """Load a comma-separated file with a header row as an R DataFrame.

     The path is tried as given first and then relative to ``DATADIR``.

     Raises:
         IOError: when the file exists at neither location.
     """
     polku = tiedosto
     if not os.path.exists(polku):
         # fall back to the data directory
         polku = os.path.join(DATADIR, tiedosto)
         if not os.path.exists(polku):
             raise IOError("Annettua tiedostoa %s ei löydy" % polku)
     return DataFrame.from_csvfile(polku, header=True, sep=',', as_is=True)
示例#9
0
文件: meld.py 项目: Shotgunosine/MELD
    def __init__(self,
                 formula_str,
                 df,
                 factors=None,
                 resid_formula_str=None,
                 **lmer_opts):
        """Prepare the R data frame and formula objects for a model.

        Args:
            formula_str: Model formula; the text before ``~`` names the
                predicted variable.
            df: pandas DataFrame (converted with ``to_records``) or a
                record array.  A zero-filled column is appended when the
                predicted variable is not already a field.
            factors: Optional dict mapping column name to factor levels.
                ``None`` values and string columns that are not listed use
                ``MissingArg``.  The dict passed in is not modified.
            resid_formula_str: Optional formula for the residual model.
            **lmer_opts: Stored unchanged in ``self._lmer_opts``.
        """
        # get the pred_var
        pred_var = formula_str.split('~')[0].strip()

        # convert df to a recarray if it's a dataframe
        if isinstance(df, pd.DataFrame):
            df = df.to_records()

        # add column if necessary
        if pred_var not in df.dtype.names:
            # must add it
            df = append_fields(df, pred_var, [0.0] * len(df), usemask=False)

        # work on a private copy so the caller's dict is never mutated
        factors = {} if factors is None else dict(factors)

        # add in missingarg for any potential factor not provided
        for k in df.dtype.names:
            if isinstance(df[k][0], str) and k not in factors:
                factors[k] = MissingArg

        for f in factors:
            if factors[f] is None:
                factors[f] = MissingArg
            # checking for both types of R Vectors for rpy2 variations
            elif (not isinstance(factors[f], Vector)
                  and not factors[f] == MissingArg):
                factors[f] = Vector(factors[f])

        # convert the recarray to a DataFrame (releveling if desired)
        self._rdf = DataFrame({
            k: (FactorVector(df[k], levels=factors[k]) if
                (k in factors) or isinstance(df[k][0], str) else df[k])
            for k in df.dtype.names
        })

        # get the column index
        self._col_ind = list(self._rdf.colnames).index(pred_var)

        # make a formula obj
        self._rformula = Formula(formula_str)

        # make one for resid if necessary
        if resid_formula_str:
            self._rformula_resid = Formula(resid_formula_str)
        else:
            self._rformula_resid = None

        # save the args
        self._lmer_opts = lmer_opts

        # model is null to start
        self._ms = None
def analyzeR():
    # Loads the tab-separated file named by `inputfile` (not defined inside
    # this function) as an R DataFrame and locates its 'ic50_val' column.
    mypatchData = DataFrame.from_csvfile(inputfile,header=True, sep = "\t")
    
    # 0-based position of the 'ic50_val' column among the column names
    ici = list(mypatchData.colnames).index('ic50_val')
    starti = ici + 1
    lasti = mypatchData.nrow
    # NOTE(review): the next line is R subsetting syntax (`df[, a:b]`) and is
    # not valid Python.  It presumably wants an rpy2 `.rx(True, ...)` column
    # selection, but the intended range is unclear: `starti` is unused and
    # `lasti` is a row count, not a column count -- TODO confirm and rewrite.
    myData = mypatchData[,ici:lasti]
示例#11
0
文件: pandas2ri.py 项目: theflow/rpy2
def py2ri_pandasdataframe(obj):
    """Convert a pandas DataFrame into an R ``DataFrame``.

    Columns are converted one by one, preserving their order: object-dtype
    columns become ``StrVector``; every other column goes through
    ``conversion.py2ri``.
    """
    od = OrderedDict()
    # `.items()` instead of `.iteritems()`, which newer pandas no longer has
    for name, values in obj.items():
        if values.dtype.kind == 'O':
            od[name] = StrVector(values)
        else:
            od[name] = conversion.py2ri(values)
    return DataFrame(od)
示例#12
0
文件: pandas2ri.py 项目: rs2/rpy2
def py2rpy_pandasdataframe(obj):
    """Convert a pandas DataFrame into an R ``DataFrame``.

    Each column is converted with ``conversion.py2rpy``.  When that raises,
    a warning naming the column and the error is issued and the column is
    converted with ``StrVector`` instead.
    """
    od = OrderedDict()
    # `.items()` instead of `.iteritems()`, which newer pandas no longer has
    for name, values in obj.items():
        try:
            od[name] = conversion.py2rpy(values)
        except Exception as e:
            # deliberate best-effort fallback: keep the column as strings
            warnings.warn('Error while trying to convert '
                          'the column "%s". Fall back to string conversion. '
                          'The error is: %s' % (name, str(e)))
            od[name] = StrVector(values)

    return DataFrame(od)
示例#13
0
def r_dataframe_subset_one_element(rdf: RDataFrame, n: int) -> RDataFrame:
    """
    Build a dataframe from the single element of ``rdf`` selected by ``n``.

    ``n`` is wrapped in an integer vector, used with R-style ``rx``
    indexing on ``rdf``, and the selection is passed to R's ``data.frame``.

    See:
        https://github.com/topepo/caret/issues/672
        https://stackoverflow.com/questions/40505994/how-to-apply-preprocessing-in-carets-train-to-only-some-variables
        https://stackoverflow.com/questions/31497479/how-to-select-columns-from-r-dataframe-in-rpy2-in-python
    """
    to_data_frame = r('data.frame')
    selector = RIntVector([n])
    selection = rdf.rx(selector)
    return to_data_frame(selection)
示例#14
0
def createGraphSeries(cohort,t,sfiles,ofile):
   """Write the requested correlation and HMM-score graphs to a PDF.

   All graphs go to ``<cohort>.stats.pdf``.

   Args:
      cohort: Cohort name; prefix of the PDF file and, upper-cased, part of
         every graph title.
      t: Which graphs to draw: ``'out'`` for the scatter plots, ``'stats'``
         for the per-HMM plots, ``'both'`` for all of them.
      sfiles: Mapping from HMM name to its tab-separated stats file.  HMM
         names without an entry are skipped.
      ofile: Tab-separated file providing the scatter-plot columns.
   """
   hm = ['bothR5','bothDX','esR5','esDX','trofileR5','trofileDX','mixedR5DX']
   # (x column, y column) pairs drawn as scatter plots, in output order
   scatter_pairs = [
      ('bothR5','bothDX'),
      ('esR5','esDX'),
      ('trofileR5','trofileDX'),
      ('bothR5','esR5'),
      ('trofileR5','esR5'),
      ('bothDX','esDX'),
      ('trofileDX','esDX'),
      ('bothDX','mixedR5DX'),
   ]

   # Begin writing graphs to pdf
   grdevices.pdf(file=cohort+".stats.pdf",width=7,height=7)
   try:
      if t == 'both' or t =='out':
         stat_data = DataFrame.from_csvfile(ofile, sep = "\t")
         # graph 1 scatter plot
         for x_col, y_col in scatter_pairs:
            scatterPlot(stat_data,x_col,y_col,cohort.upper()+' cohort\n '+x_col+'/'+y_col+' correlation')

      if t == 'both' or t =='stats':
         for hmm in hm:
            # `in` replaces dict.has_key, which only exists on Python 2
            if hmm not in sfiles:
               continue
            f = sfiles[hmm]
            hmm_scores = DataFrame.from_csvfile(f,sep="\t")
            # graph 3
            plot(hmm_scores.rx2("Cutoff"),hmm_scores.rx2("Accuracy"),type='o',main='Accuracy vs. Cutoff '+cohort.upper()+' Cohort\n'+hmm+'.hmm',xlab='Cutoff',ylab='Accuracy')
            # graph 4
            # NOTE(review): the y axis plots the "TPP" column but is labelled
            # '% False Neg' -- confirm which of the two is intended.
            plot(hmm_scores.rx2("FPP"),hmm_scores.rx2("TPP"),type='o',xlim=base.c(0,1),ylim=base.c(0,1),main=cohort.upper()+' Cohort ROC\n'+hmm+'.hmm',xlab='% False Pos',ylab='% False Neg')
            # graph 5
            plot(hmm_scores.rx2("Cutoff"),hmm_scores.rx2("Phi"),type='o',xlim=base.c(50,100),ylim=base.c(0,0.75),main=cohort.upper()+' Cohort Association Coeff\n'+hmm+'.hmm',xlab='Cutoff',ylab='Association Coefficient')
            # graph 6
            plot(hmm_scores.rx2("Cutoff"),hmm_scores.rx2("Specificity"),type='o',main='Specificity vs. Cutoff '+cohort.upper()+' Cohort\n'+hmm+'.hmm',xlab='Cutoff',ylab='Specificity')
            # graph 7
            plot(hmm_scores.rx2("Cutoff"),hmm_scores.rx2("Sensitivity"),type='o',main='Sensitivity vs. Cutoff '+cohort.upper()+' Cohort\n'+hmm+'.hmm',xlab='Cutoff',ylab='Sensitivity')
            # graph 8
            plot(hmm_scores.rx2("Sensitivity"),hmm_scores.rx2("Specificity"),type='o',main='Sensitivity vs. Specificity '+cohort.upper()+' Cohort\n'+hmm+'.hmm',xlab='Sensitivity',ylab='Specificity')
   finally:
      # close pdf file, even when a plot or a file read fails,
      # so the graphics device is not left open
      grdevices.dev_off()
   return
示例#15
0
def r_dataframe_column_to_factor_by_name(rdf: RDataFrame,
                                         name: str) -> RDataFrame:
    """
    Replace the column called ``name`` with a factor-vector version of it.

    Note: The dataframe is changed in place and also returned.

    Raises:
        ValueError: when no column of ``rdf`` is called ``name``.
    """
    column_names = r_dataframe_column_names(rdf)
    for position, column_name in enumerate(column_names):
        if column_name != name:
            continue
        as_factor = RFactorVector(rdf.rx2(name))
        rdf[position] = RFactorVector(as_factor)
        return rdf

    raise ValueError('Given name is not in R dataframe')
示例#16
0
def as_dataframe(table):
    '''returns a DataFrame instance. Requires counts to be
    [[col1, col2, col3, ..]]

    The rows from ``table.tolist()`` are transposed into columns and paired
    with ``table.header``.  A column whose first value is a string becomes a
    ``StrVector``; any other column becomes an ``IntVector``.
    '''
    columns = zip(*table.tolist())
    data = {}
    for name, values in zip(table.header, columns):
        # isinstance also accepts str subclasses, unlike an exact type match
        klass = StrVector if isinstance(values[0], str) else IntVector
        data[name] = klass(values)

    return DataFrame(data)
示例#17
0
def createModel(data, param):
    """Forecast every row of ``data`` with auto.arima and score the result.

    Each row is converted to an R time series with frequency
    ``param['frequency']``, fitted with ``auto.arima`` and forecast 60 steps
    ahead (columns ``Ret_121_pred`` .. ``Ret_180_pred``).  The forecasts are
    joined with ``raw_data``, the two "PlusOne"/"PlusTwo" predictions are
    derived, and the loss is computed after ``WMAE_model`` has run.

    Relies on the names ``raw_data``, ``price_train``, ``transform_format``,
    ``WMAE_model`` and ``STATUS_OK``, none of which are defined in this
    function.  Writes ``raw.csv``, ``predict.csv``, ``data.raw.csv`` and
    ``data.csv``.

    Args:
        data: R data frame; iterated row by row with ``DataFrame.iter_row``.
        param: Dict providing the ``'frequency'`` of the time series.

    Returns:
        dict: ``{'loss': <float>, 'status': STATUS_OK}``.
    """
    print('Create_model')
    importr('forecast')
    # NOTE(review): the auto.arima call below contains an empty argument
    # (", ," across the line break) -- confirm that it is intentional.
    robj.r('''
            arima_data <- function(data){

               best_arima = auto.arima(data,trace=F,stepwise=T,max.P=8,max.Q=8,max.p=10,max.q=10,max.order=10,
               ,start.p=1,start.q=0,start.P=1,start.Q=0,seasonal=T,ic=('bic'))
               forecast = forecast.Arima(best_arima,h=60,level=c(99.5),stationary=T)
               output = forecast$mean
               return (output)


            }

   ''')
    print('the frequency is %d' % (param['frequency']))
    features_names = ["Ret_%d_pred" % (i) for i in range(121, 181)]

    # R callables are looked up once instead of on every row
    as_numeric = robj.r('as.numeric')
    make_ts = robj.r('ts')
    arima_data = robj.r('arima_data')

    # Collect the per-row frames and concatenate once at the end; the empty
    # leading frame keeps the result identical to growing it row by row.
    frames = [pd.DataFrame(columns=features_names)]
    for i, tmp in enumerate(DataFrame.iter_row(data), 1):
        if i % 100 == 0:
            print(i)  # progress indicator
        tmp = as_numeric(tmp)
        tmp = make_ts(tmp, start=2, frequency=param['frequency'])
        forecast = np.array(as_numeric(arima_data(tmp)))
        predict2 = pd.DataFrame(forecast).T
        predict2.columns = features_names
        frames.append(predict2)
    predict = pd.concat(frames, axis=0)

    #this way I will get forecast_data , train_data
    raw_data.to_csv("raw.csv")
    predict.to_csv("predict.csv")
    data = predict.join(raw_data, rsuffix='_2')
    data.to_csv("data.raw.csv")
    data['Ret_120_price'] = price_train['Ret_120_price']
    transform_format(data)
    Ret_1 = data['Ret_MinusTwo']
    Ret_2 = 1 - (1.0 / ((1.0 / (1 - data['Ret_MinusOne'])) *
                        data['Ret_120_price'] * data['Ret_180_price']))
    data['Ret_PlusOne_pred'] = 0.5 * Ret_1 + 0.5 * Ret_2
    data['Ret_PlusTwo_pred'] = 0.5 * Ret_2 + 0.5 * data['Ret_PlusOne_pred']
    data.to_csv("data.csv")
    WMAE_model(data)
    # NOTE(review): 40000 * 62 is a hard-coded normaliser -- presumably
    # rows * predicted columns; confirm against the data set.
    mase = np.sum(data['error']) / (40000 * 62)
    print('loss:%f' % (mase))
    return {'loss': mase, 'status': STATUS_OK}
示例#18
0
    def loadfiles(self):
        """
        Load files into R environment

        Each path in ``self.filelist`` is read with
        ``DataFrame.from_csvfile`` using the matching separator from
        ``self.seplist`` and converted with R's ``as.matrix``.  Diagonal
        entries that are >= 1e-8 are set to 0; when that happened for a file
        its path is added to ``self.e``.  The matrix is appended to
        ``self.mylist`` and the file's base name (directory and extension
        stripped) to ``self.listname``.  ``IOError`` and ``RRuntimeError``
        are added to ``self.error`` instead of being raised.
        """
        rcount = 0
        asmatrix = robjects.r['as.matrix']
        diag = robjects.r['diag']
        names = robjects.r['names']

        ## Set the default parameter for reading from csv
        param = {'header': True, 'as_is': True, 'row.names': ri.RNULLArg}
        ## Override a default only when self.param carries a non-None value
        for p in param:
            if p in self.param and self.param[p] is not None:
                param[p] = self.param[p]
        for f, s in zip(self.filelist, self.seplist):
            try:
                dataf = DataFrame.from_csvfile(f,
                                               sep=str(s),
                                               header=param['header'],
                                               as_is=param['as_is'],
                                               row_names=param['row.names'])

                dataf = asmatrix(dataf)

                # Should be the diagonal set to 0?
                # Do it for all the inputs, just to be sure
                # NOTE(review): only positive diagonal entries are caught by
                # this test; negative non-zero values are left untouched --
                # confirm whether abs() was intended.
                zcount = 0
                for i in range(dataf.ncol):
                    if (dataf.rx(i+1,i+1)[0] - 0.0 >= 1e-8):
                        zcount += 1
                        dataf.rx[i+1,i+1] = 0

                if zcount:
                    self.e += f

                self.mylist.append(dataf)
                fdir, fname = os.path.split(os.path.splitext(f)[0])
                self.listname.append(fname)

                rcount += 1
            except (IOError, RRuntimeError) as e:
                # `as` form is valid on Python 2.6+ and Python 3
                self.error += e
示例#19
0
def stepwise_regression(data, d_v, i_vs):
    """Run a backward stepwise linear regression in R.

    ``data`` is written to a temporary CSV file, loaded as an R data frame,
    fitted with ``lm`` using the formula ``d_v ~ i_vs[0]+i_vs[1]+...`` and
    reduced with ``step(direction='backward')``.

    Args:
        data: Iterable of CSV lines, written verbatim in binary mode.
        d_v: Name of the dependent variable.
        i_vs: Names of the independent variables.

    Returns:
        str: String form of the first element of the ``step`` result.
    """
    import os
    import tempfile

    stats = importr('stats')
    pat = '%s~%s' % (d_v, '+'.join(i_vs))
    print(pat)

    # A private temporary file replaces the former hard-coded path in a
    # user's home directory; it is removed once R has read it.
    tmp = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    try:
        try:
            tmp.writelines(data)
        finally:
            tmp.close()
        data_from_input = DataFrame.from_csvfile(tmp.name)
    finally:
        os.remove(tmp.name)

    reg = stats.lm(pat, data_from_input)
    st = stats.step(reg, direction = 'backward')
    ret = str(st[0])
    return ret
示例#20
0
 def read_data(self, file_name, col_from, col_to):
     """Load ``../Data/<file_name>.xlsx`` and split it into data and labels.

     Columns ``col_from`` .. ``col_to`` (inclusive) are stored as the
     numeric data; column ``col_from - 1`` provides the metadata labels.
     The raw frame's row names are replaced by 1..number of labels.
     Finishes by calling ``self.make_metabolite_dict()``.
     """
     workbook_path = "../Data/" + file_name + '.xlsx'
     frame = DataFrame(excel.read_excel(workbook_path))

     # the numeric slice is taken before the row names are replaced
     numeric_columns = IntVector(range(col_from, col_to + 1))
     numeric_part = frame.rx(True, numeric_columns)
     labels = frame.rx(True, col_from - 1)[0]

     row_numbers = IntVector(range(1, len(labels) + 1))
     frame._set_rownames(row_numbers)

     self.file_name = file_name
     self.raw_data = frame
     self.numeric_data = numeric_part
     self.metadata = r_base.factor(labels)
     self.metadata_list = list(labels)
     # every column name except the first one
     self.metabolite_list = list(frame.names)[1:]
     self.make_metabolite_dict()
示例#21
0
def pandas2ri(obj):
    """Convert a pandas object into its R counterpart.

    * ``PandasDataFrame``: each column is converted (object dtype to
      ``StrVector``, otherwise recursively) and wrapped in a ``DataFrame``.
    * ``PandasIndex``: ``StrVector`` for object dtype, otherwise the numpy
      converter.
    * ``PandasSeries``: ``'<M8[ns]'`` series become ``POSIXct``; other
      series go through the numpy converter; the index is attached as the
      'names' (or 'dimnames') slot.
    * Anything else is delegated to ``original_py2ri``.
    """
    if isinstance(obj, PandasDataFrame):
        od = OrderedDict()
        # `.items()` instead of `.iteritems()`, which newer pandas no
        # longer has
        for name, values in obj.items():
            if values.dtype.kind == 'O':
                od[name] = StrVector(values)
            else:
                od[name] = pandas2ri(values)
        return DataFrame(od)
    elif isinstance(obj, PandasIndex):
        if obj.dtype.kind == 'O':
            return StrVector(obj)
        else:
            # only other alternative to 'O' is integer, I think,
            # which goes straight to the numpy converter.
            return numpy2ri.numpy2ri(obj)
    elif isinstance(obj, PandasSeries):
        if obj.dtype == '<M8[ns]':
            # time series
            d = [
                IntVector([x.year for x in obj]),
                IntVector([x.month for x in obj]),
                IntVector([x.day for x in obj]),
                IntVector([x.hour for x in obj]),
                IntVector([x.minute for x in obj]),
                IntVector([x.second for x in obj])
            ]
            res = ISOdatetime(*d)
            #FIXME: can the POSIXct be created from the POSIXct constructor ?
            # (is '<M8[ns]' mapping to Python datetime.datetime ?)
            res = POSIXct(res)
        else:
            # converted as a numpy array
            res = numpy2ri.numpy2ri(obj.values)
        # "index" is equivalent to "names" in R
        if obj.ndim == 1:
            res.do_slot_assign('names', ListVector({'x':
                                                    pandas2ri(obj.index)}))
        else:
            res.do_slot_assign('dimnames', ListVector(pandas2ri(obj.index)))
        return res
    else:
        return original_py2ri(obj)
示例#22
0
def aov(matrix, factor_names, measure_name, robj, interactions = '+'):
    '''
    Computes a repeated measures anova in R via the 'aov' command.

    This function uses R's aov function. It does not compute
    Greenhouse-Geisser and Huynh-Feldt corrections. Use lm_anova
    for this.

    Note: the R global environment is cleared first, and the summary is
    printed rather than returned.

    Input:
        matrix : ndarray
            Each dimension of the matrix corresponds to one factor.
            The first dimension must be (!) the number of subjects.
            The values in the matrix are taken as the dependent variable.
        factor_names : list
            List with names of each factor. The ordering must correspond
            to the dimensions given by matrix.shape.
        measure_name : str
            Name of the dependent variable.
        robj : rpy2.robjects instance
        interactions : str
            '+' for no interactions
            '*' for all interactions
    '''

    robj.r('rm(list = ls(all = TRUE)) ')
    df = make_data_frame(matrix,
            factor_names, measure = measure_name)
    robj.globalenv['df'] = DataFrame(df)
    robj.r('attach(df)')
    try:
        for factor in factor_names:
            robj.r('%s<-factor(df$%s)'%(factor,factor))
        formula = interactions.join(factor_names)
        error = '*'.join(factor_names)
        # NOTE(review): `data=df` sits inside Error(...) in this R call; the
        # string is kept exactly as it was so the call is unchanged.
        formula = 'aov.out <- aov(%s  ~ %s + Error(subject/(%s), data=df))'%(measure_name, formula, error)
        robj.r(formula)
        print(robj.r('summary(aov.out)'))
    finally:
        # always detach, so a failing R call does not leave df attached
        robj.r('detach(df)')
示例#23
0
文件: rutils.py 项目: MPBA/renette
def csv2graph(csvfiles, seplist=[], param={},filepath='.', graph_format='gml'):
    """
    Utility to convert from csv file to igraph format file

    Every csv file is read as an R data frame, turned into an undirected
    weighted graph with igraph's ``graph_adjacency`` and written to
    ``<filepath>/<csvfile>.<graph_format>``.

    Args:
        csvfiles: Paths of the csv files to convert.
        seplist: Field separators, one per csv file.
        param: Optional read settings.  ``'header'`` (default True) and
            ``'row.names'`` / ``'row_names'`` (default False) are used.
        filepath: Directory the graph files are written to.
        graph_format: igraph output format, also used as file extension.

    Returns:
        True once every file has been written.

    Raises:
        IOError: when ``seplist`` and ``csvfiles`` differ in length.
    """

    igraph = importr('igraph')
    gadj = igraph.graph_adjacency
    wgraph = igraph.write_graph

    if len(seplist) != len(csvfiles):
        raise IOError('Not enought separators')

    header = param.get('header', True)
    # accept both spellings of the key; the dotted one wins
    row_names = param.get('row.names', param.get('row_names', False))

    for f, sep in zip(csvfiles, seplist):
        # use the requested format, not the `format` builtin
        myfname = f + ".%s" % graph_format
        tmpdata = DataFrame.from_csvfile(f,
                                         sep=sep,
                                         header=header,
                                         as_is=True,
                                         row_names=row_names)
        # NOTE(review): the frame just read is passed on (the former name
        # `reslist` was never defined); graph_adjacency may need it
        # converted with as.matrix first -- TODO confirm.
        g = gadj(tmpdata, mode='undirected', weighted=True)
        wgraph(g, file=os.path.join(filepath,myfname), format=graph_format)

    return True
示例#24
0
def r_formula(rdf: RDataFrame, target: str, predictors: List[str]) -> RFormula:
    """
    Build an R modelling formula bound to columns of the given dataframe.

    The formula text is 'target ~ var1 + var2 + etc...'. For every
    predictor, ``rdf.rx(predictor)`` is stored under the predictor's name in
    the formula's environment.
    """
    tokens = [target, '~']
    if predictors:
        tokens.append(' + '.join(predictors))

    formula = RFormula(' '.join(tokens))

    environment = formula.environment
    for column in predictors:
        environment[column] = rdf.rx(column)

    return formula
# R helper: distance object built from 1 - cor(x), using the given `use`
# setting for the correlation.
distEisen = robjects.r('''
                       distEisen <- function(x, use = "pairwise.complete.obs") {
                       co.x <- cor(x, use = use)
                       dist.co.x <- 1 - co.x
                       return(as.dist(dist.co.x))
                       }
                       ''')

# R helper: pads every element of a list with `fill` up to the length of the
# longest element, then converts the list to a data frame.
listToDF = robjects.r('''
           listToDF <- function(inputList, fill = NA){
               # Use fill = NULL for regular recycling behavior
               maxLen = max(sapply(inputList, length))
               for(i in seq_along(inputList))
                   inputList[[i]] <- c(inputList[[i]], rep(fill, maxLen -
           length(inputList[[i]])))
               return(as.data.frame(inputList))
           }
           ''')
# Tab-separated annotation table with a header row; the first column is
# used as row names.  `annotation_classes_input_file` is not defined in the
# lines shown here.
annotations = DataFrame.from_csvfile(annotation_classes_input_file,
                                     header=True,
                                     sep='\t',
                                     quote='"',
                                     row_names=1)


# Shorthand for the embedded R instance; load the R packages used below.
R = robjects.r
R["library"]("utils")
R["library"]("tools")


示例#26
0
    def __init__(self,
                 fe_formula,
                 re_formula,
                 re_group,
                 dep_data,
                 ind_data,
                 factors=None,
                 row_mask=None,
                 dep_mask=None,
                 use_ranks=False,
                 use_norm=True,
                 memmap=False,
                 memmap_dir=None,
                 resid_formula=None,
                 svd_terms=None,
                 feat_thresh=0.05,
                 feat_nboot=1000,
                 do_tfce=False,
                 connectivity=None,
                 shape=None,
                 dt=.01,
                 E=2 / 3.,
                 H=2.0,
                 n_jobs=1,
                 verbose=10,
                 lmer_opts=None):
        """

        dep_data can be an array or a dict of arrays (possibly
        memmapped), one for each group.

        ind_data can be a rec_array for each group or one large rec_array
        with a grouping variable.

        """
        if verbose > 0:
            sys.stdout.write('Initializing...')
            sys.stdout.flush()
            start_time = time.time()

        # save the formula
        self._formula_str = fe_formula + ' + ' + re_formula

        # see if there's a resid formula
        if resid_formula:
            # the random effects are the same
            self._resid_formula_str = resid_formula + ' + ' + re_formula
        else:
            self._resid_formula_str = None

        # save whether using ranks
        self._use_ranks = use_ranks

        # see the thresh for keeping a feature
        self._feat_thresh = feat_thresh
        self._feat_nboot = feat_nboot
        self._do_tfce = do_tfce
        self._connectivity = connectivity
        self._dt = dt
        self._E = E
        self._H = H

        # see if memmapping
        self._memmap = memmap

        # save job info
        self._n_jobs = n_jobs
        self._verbose = verbose

        # eventually fill the feature shape
        self._feat_shape = None

        # handle the dep_mask
        self._dep_mask = dep_mask

        # fill A,M,O,D
        self._A = {}
        self._M = {}
        self._O = {}
        self._D = {}
        O = []

        # loop over unique grouping var
        self._re_group = re_group
        if isinstance(ind_data, dict):
            # groups are the keys
            self._groups = np.array(ind_data.keys())
        else:
            # groups need to be extracted from the recarray
            self._groups = np.unique(ind_data[re_group])
        for g in self._groups:
            # get that subj inds
            if isinstance(ind_data, dict):
                # the index is just the group into that dict
                ind_ind = g
            else:
                # select the rows based on the group
                ind_ind = ind_data[re_group] == g

            # process the row mask
            if row_mask is None:
                # no mask, so all good
                row_ind = np.ones(len(ind_data[ind_ind]), dtype=np.bool)
            elif isinstance(row_mask, dict):
                # pull the row_mask from the dict
                row_ind = row_mask[g]
            else:
                # index into it with ind_ind
                row_ind = row_mask[ind_ind]

            # extract that group's A,M,O
            # first save the observations (rows of A)
            self._O[g] = ind_data[ind_ind][row_ind]
            if use_ranks:
                # loop over non-factors and rank them
                for n in self._O[g].dtype.names:
                    if (n in factors) or isinstance(self._O[g][n][0], str):
                        continue
                    self._O[g][n] = rankdata(self._O[g][n])
            O.append(self._O[g])

            # eventually allow for dict of data files for dep_data
            if isinstance(dep_data, dict):
                # the index is just the group into that dict
                dep_ind = g
            else:
                # select the rows based on the group
                dep_ind = ind_ind

            # save feature shape if necessary
            if self._feat_shape is None:
                self._feat_shape = dep_data[dep_ind].shape[1:]

            # handle the mask
            if self._dep_mask is None:
                self._dep_mask = np.ones(self._feat_shape, dtype=np.bool)

            # create the connectivity (will mask later)
            if self._do_tfce and self._connectivity is None and \
               (len(self._dep_mask.flatten()) > self._dep_mask.sum()):
                # create the connectivity
                self._connectivity = cluster.sparse_dim_connectivity(
                    [cluster.simple_neighbors_1d(n) for n in self._feat_shape])

            # Save D index into data (apply row and feature masks
            # This will also reshape it
            self._D[g] = dep_data[dep_ind][row_ind][:, self._dep_mask].copy()

            # reshape it
            #self._D[g] = self._D[g].reshape((self._D[g].shape[0], -1))
            if use_ranks:
                if verbose > 0:
                    sys.stdout.write('Ranking %s...' % (str(g)))
                    sys.stdout.flush()

                for i in xrange(self._D[g].shape[1]):
                    # rank it
                    self._D[g][:, i] = rankdata(self._D[g][:, i])

                    # normalize it
                    self._D[g][:, i] = ((self._D[g][:, i] - 1) /
                                        (len(self._D[g][:, i]) - 1))

            # save M from D so we can have a normalized version
            self._M[g] = self._D[g].copy()

            # remove any NaN's in dep_data
            self._D[g][np.isnan(self._D[g])] = 0.0

            # normalize M
            if use_norm:
                self._M[g] -= self._M[g].mean(0)
                self._M[g] /= np.sqrt((self._M[g]**2).sum(0))

            # determine A from the model.matrix
            rdf = DataFrame({
                k: (FactorVector(self._O[g][k])
                    if k in factors else self._O[g][k])
                for k in self._O[g].dtype.names
            })

            # model spec as data frame
            ms = r['data.frame'](r_model_matrix(Formula(fe_formula), data=rdf))

            cols = list(r['names'](ms))
            if svd_terms is None:
                self._svd_terms = [c for c in cols if 'Intercept' not in c]
            else:
                self._svd_terms = svd_terms

            # self._A[g] = np.vstack([ms[c] #np.array(ms.rx(c))
            self._A[g] = np.concatenate(
                [np.array(ms.rx(c)) for c in self._svd_terms]).T

            if use_ranks:
                for i in xrange(self._A[g].shape[1]):
                    # rank it
                    self._A[g][:, i] = rankdata(self._A[g][:, i])

                    # normalize it
                    self._A[g][:, i] = ((self._A[g][:, i] - 1) /
                                        (len(self._A[g][:, i]) - 1))

            # normalize A
            if True:  # use_norm:
                self._A[g] -= self._A[g].mean(0)
                self._A[g] /= np.sqrt((self._A[g]**2).sum(0))

            # memmap if desired
            if self._memmap:
                self._M[g] = _memmap_array(self._M[g],
                                           memmap_dir,
                                           unique_id=str(g))
                self._D[g] = _memmap_array(self._D[g],
                                           memmap_dir,
                                           unique_id=str(g))

        # save the new O
        self._O = O
        if lmer_opts is None:
            lmer_opts = {}
        self._lmer_opts = lmer_opts
        self._factors = factors

        # mask the connectivity
        if self._do_tfce and (len(self._dep_mask.flatten()) >
                              self._dep_mask.sum()):
            self._connectivity = self._connectivity.tolil()[
                self._dep_mask.flatten()][:, self._dep_mask.flatten()].tocoo()

        # prepare for the perms and boots and jackknife
        self._perms = []
        self._tp = []
        self._tb = []
        self._tj = []
        self._pfmask = []

        if verbose > 0:
            sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
            sys.stdout.write('Processing actual data...')
            sys.stdout.flush()
            start_time = time.time()

        global _global_meld
        _global_meld[id(self)] = self

        # run for actual data (returns both perm and boot vals)
        self._R = None
        self._ss = None
        self._mer = None
        tp, tb, R, feat_mask, ss, mer = _eval_model(id(self), None)
        self._R = R
        self._tp.append(tp)
        self._tb.append(tb)
        self._feat_mask = feat_mask
        self._fmask = ~feat_mask[0]
        self._pfmask.append(~feat_mask[0])
        self._ss = ss
        self._mer = mer

        if verbose > 0:
            sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
            sys.stdout.flush()
示例#27
0
    def __init__(self, dex_name):
        """Read the CSV file at ``dex_name`` into ``self.dexcom_data``.

        The file is loaded with ``DataFrame.from_csvfile`` using its default
        options, so the attribute holds whatever that constructor returns.
        """
        loaded_frame = DataFrame.from_csvfile(dex_name)
        self.dexcom_data = loaded_frame
示例#28
0
 def from_csv(cls, data):
     """Build a ``cls`` instance from the CSV file named by ``data``.

     ``data`` is converted with ``str()`` before being handed to
     ``DataFrame.from_csvfile``; the resulting frame is the sole argument
     passed to ``cls``.
     """
     csv_path = str(data)
     frame = DataFrame.from_csvfile(csv_path)
     return cls(frame)
示例#29
0
def test_image_png():
    """Check that ``ggplot.image_png`` returns a truthy value for a plot."""
    # Minimal two-column frame; only used as the plot's data source.
    frame = DataFrame({'x': 1, 'Y': 2})
    plot = rpy2.robjects.lib.ggplot2.ggplot(frame)
    rendered = ggplot.image_png(plot)
    assert rendered
示例#30
0
from rpy2.robjects.vectors import DataFrame
import math
import datetime

# Command-line interface: -in_csv names the input CSV, -out is accepted but
# not used in the visible part of this script.
parser = argparse.ArgumentParser()

parser.add_argument("-in_csv", help="")
parser.add_argument("-out", help="")

args = parser.parse_args()

# Honour -in_csv when it is supplied (it used to be parsed and then ignored);
# fall back to the previously hard-coded location so existing runs without
# the flag keep working.
infile = args.in_csv or "/Users/security/science/bigoutput.csv"

# Load the comma-separated file, first row as header, into an R data frame.
dataf = DataFrame.from_csvfile(infile, sep=",", header=True)

# Get statistics for investigated seqs
#rmean = robjects.r['mean']
#rmed = robjects.r['median']
#rmax = robjects.r['max']
#rsd = robjects.r['sd']
#rsum = robjects.r['sum']
#
#ma=rmax(dataf.rx('Length'))
#
#as_vec = robjects.r['as.vector']
#as_num = robjects.r['as.numeric']
#as_mat = robjects.r['as.matrix']
#
#test22=as_vec(dataf.rx('Length'))
示例#31
0
##    print pat
##    return None
#    __file.close()
#    data_from_input = DataFrame.from_csvfile(__file.name)
#    reg = stats.lm(pat, data_from_input)
#    st = stats.step(reg, direction = 'backward')
##    ret = ''
##    for key, value in st.iteritems():
##        ret += key + ',' + str(value) + '\n'
##    return ret
#    return ''

# stepwise_regression()
# Backward stepwise regression of the dependent variable `d_v` on the
# candidate predictors `i_vs`, using R's stats::lm and stats::step.
d_v = "inflat"
i_vs = ["money", "output", "initial", "poprate", "inv", "school"]
data_from_input = DataFrame.from_csvfile("/home/foodfan/money.csv")
stats = importr("stats")

# Model formula as text, e.g. "inflat~money+output+...".
pat = "%s~%s" % (d_v, "+".join(i_vs))
reg = stats.lm(pat, data_from_input)
st = stats.step(reg, direction="backward")
ret = ""

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the former print statements were a SyntaxError on Python 3.
print("------------------------------------")

# First element of the object returned by step().
print(str(st[0]))

# `ret` is never filled in, so this only emits an empty line.
print(ret)
示例#32
0
def main():
    """Compute summary statistics of a column expression with R.

    Reads three command-line arguments: the tab-separated input file, the
    output file name, and an expression built from column references
    (``c1``, ``c2``, ...) together with the function names and operators
    listed by ``S3_METHODS()``.

    Data rows (non-empty, not starting with ``#``) whose referenced columns
    all parse as floats are copied to a temporary file, which is loaded as an
    R data frame.  The expression is evaluated in R and its sum, mean,
    standard deviation and quantiles are written to the output file as one
    tab-separated row below a ``#``-prefixed header line.  Problems are
    reported through ``stop_err`` (presumably terminating the program --
    confirm against its definition).
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions.
    # NOTE(review): `expression` comes from argv and is later evaluated by R;
    # these whitelist checks are the only guard against arbitrary R code.
    for word in re.findall(r'[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression (c1 -> index 0, ...).
    column_refs = re.findall(r'c[0-9]+', expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    # Text mode so that writing str works on both Python 2 and Python 3; the
    # context manager guarantees the temporary file is removed on every path.
    with tempfile.NamedTemporaryFile('w+') as tmp_file:
        # Write the R header row to the temporary file
        hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
        tmp_file.write("%s\n" % hdr_str)
        skipped_lines = 0
        valid_lines = 0
        first_invalid_line = 0
        with open(datafile) as data_handle:
            for i, line in enumerate(data_handle):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                try:
                    # Every referenced column must exist and be numeric.
                    for col in cols:
                        float(fields[col])
                except (ValueError, IndexError):
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                    continue
                # Write the R data row to the temporary file
                data_str = "\t".join(fields[col] for col in cols)
                tmp_file.write("%s\n" % data_str)
                valid_lines += 1
        # R reads the file by name, so the buffered rows must be on disk.
        tmp_file.flush()

        # Fail when rows were rejected and none survived.  Counting valid rows
        # (instead of comparing against the last line index) keeps the check
        # correct when the input also contains comment or blank lines.
        if skipped_lines and not valid_lines:
            stop_err("Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements.")
        else:
            # summary function and return labels
            summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
            headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
            headings_str = "\t".join(headings)

            r_data_frame = DataFrame.from_csvfile(tmp_file.name, header=True, sep="\t")

            outfile = open(outfile_name, 'w')
            try:
                # Expose each referenced column to R under its cN name.
                for col in column_refs:
                    r.assign(col, r["$"](r_data_frame, col))
                try:
                    summary = summary_func(r(expression))
                except RException as s:
                    # Close before reporting, as the error path always did.
                    outfile.close()
                    stop_err("Computation resulted in the following error: %s" % str(s))
                outfile.write("#%s\n" % headings_str)
                print(summary)
                print(summary.r_repr())
                outfile.write("%s\n" % "\t".join(["%g" % (summary.rx2(k)[0]) for k in headings]))
            finally:
                outfile.close()

            if skipped_lines:
                print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (skipped_lines, first_invalid_line))
示例#33
0
    def __init__(
            self,
            fe_formula,
            re_formula,
            re_group,
            dep_data,
            ind_data,
            factors=None,
            row_mask=None,
            use_ranks=False,
            use_norm=True,
            memmap=False,
            memmap_dir=None,
            resid_formula=None,
            null_formula=None,
            num_null_boot=0,
            svd_terms=None,
            use_ssvd=False,
            n_jobs=1,
            verbose=10,
            lmer_opts=None):
        """Prepare per-group matrices and evaluate the model on the real data.

        Parameters
        ----------
        fe_formula, re_formula : str
            Joined with ``' + '`` to form the stored model formula;
            ``fe_formula`` alone is used to build the per-group design
            matrix through R.
        re_group : str
            Field of ``ind_data`` whose unique values define the groups
            (only consulted when ``ind_data`` is not a dict).
        dep_data : array or dict
            Dependent data, either one array indexed by the group's rows or
            a dict keyed by group.  Trailing dimensions are flattened.
        ind_data : record array or dict
            Observations, either one record array or a dict keyed by group.
        factors : list of str, optional
            Fields converted to R factors.  ``None`` means no factors.
        row_mask : None, dict or array, optional
            Row selection: all rows, a per-group dict, or an array indexed
            with the group's rows.
        use_ranks : bool
            Rank-transform non-factor observations, the dependent data and
            the design matrix with ``rankdata``.
        use_norm : bool
            Centre and scale the columns of the normalised dependent data.
        memmap, memmap_dir
            When ``memmap`` is true the per-group arrays are passed through
            ``_memmap_array`` together with ``memmap_dir``.
        resid_formula, null_formula : str, optional
            When given, each is joined with ``re_formula`` and stored.
        num_null_boot, use_ssvd, n_jobs, lmer_opts
            Stored on the instance for later use.
        svd_terms : list of str, optional
            Design-matrix columns to keep; defaults to every column whose
            name does not contain ``'Intercept'``.
        verbose : int
            Progress messages are written to stdout when greater than 0.
        """
        if verbose > 0:
            sys.stdout.write('Initializing...')
            sys.stdout.flush()
            start_time = time.time()

        # ``factors`` defaults to None, but membership tests below need a
        # container; the caller's value is still what gets stored on self.
        factor_names = [] if factors is None else factors

        # save the formula
        self._formula_str = fe_formula + ' + ' + re_formula

        # see if there's a resid formula
        if resid_formula:
            # the random effects are the same
            self._resid_formula_str = resid_formula + ' + ' + re_formula
        else:
            self._resid_formula_str = None

        # see if there's a null formula
        if null_formula:
            # the random effects are the same
            self._null_formula_str = null_formula + ' + ' + re_formula
        else:
            self._null_formula_str = None
        self._num_null_boot = num_null_boot

        # save whether using ranks
        self._use_ranks = use_ranks

        # see whether to use sparse svd
        self._use_ssvd = use_ssvd

        # see if memmapping
        self._memmap = memmap

        # save job info
        self._n_jobs = n_jobs
        self._verbose = verbose

        # eventually fill the feature shape
        self._feat_shape = None

        # fill A,M,O,D
        self._A = {}
        self._M = {}
        self._O = {}
        self._D = {}
        O = []

        # loop over unique grouping var
        self._re_group = re_group
        if isinstance(ind_data, dict):
            # groups are the keys
            self._groups = np.array(list(ind_data.keys()))
        else:
            # groups need to be extracted from the recarray
            self._groups = np.unique(ind_data[re_group])
        for g in self._groups:
            # get that subj inds
            if isinstance(ind_data, dict):
                # the index is just the group into that dict
                ind_ind = g
            else:
                # select the rows based on the group
                ind_ind = ind_data[re_group] == g

            # process the row mask
            if row_mask is None:
                # no mask, so all good (builtin bool: the np.bool alias was
                # removed from NumPy)
                row_ind = np.ones(len(ind_data[ind_ind]), dtype=bool)
            elif isinstance(row_mask, dict):
                # pull the row_mask from the dict
                row_ind = row_mask[g]
            else:
                # index into it with ind_ind
                row_ind = row_mask[ind_ind]

            # extract that group's A,M,O
            # first save the observations (rows of A)
            self._O[g] = ind_data[ind_ind][row_ind]
            if use_ranks:
                # loop over non-factors and rank them
                for n in self._O[g].dtype.names:
                    if (n in factor_names) or isinstance(self._O[g][n][0], str):
                        continue
                    self._O[g][n] = rankdata(self._O[g][n])
            O.append(self._O[g])

            # eventually allow for dict of data files for dep_data
            if isinstance(dep_data, dict):
                # the index is just the group into that dict
                dep_ind = g
            else:
                # select the rows based on the group
                dep_ind = ind_ind

            # save feature shape if necessary
            if self._feat_shape is None:
                self._feat_shape = dep_data[dep_ind].shape[1:]

            # Save D index into data
            self._D[g] = dep_data[dep_ind][row_ind]
            # flatten every trailing dimension into a single feature axis
            self._D[g] = self._D[g].reshape((self._D[g].shape[0], -1))
            if use_ranks:
                if verbose > 0:
                    sys.stdout.write('Ranking %s...' % (str(g)))
                    sys.stdout.flush()

                for i in range(self._D[g].shape[1]):
                    self._D[g][:, i] = rankdata(self._D[g][:, i])

            # M starts as a copy of D so normalising it leaves D untouched
            self._M[g] = self._D[g].copy()

            # normalize M
            if use_norm:
                self._M[g] -= self._M[g].mean(0)
                self._M[g] /= np.sqrt((self._M[g]**2).sum(0))

            # determine A from the model.matrix
            rdf = DataFrame({
                k: (FactorVector(self._O[g][k])
                    if k in factor_names else self._O[g][k])
                for k in self._O[g].dtype.names
            })

            # model spec as data frame
            ms = r['data.frame'](r_model_matrix(Formula(fe_formula), data=rdf))

            cols = list(r['names'](ms))
            if svd_terms is None:
                self._svd_terms = [c for c in cols if 'Intercept' not in c]
            else:
                self._svd_terms = svd_terms
            self._A[g] = np.concatenate(
                [np.array(ms.rx(c)) for c in self._svd_terms]).T

            if use_ranks:
                for i in range(self._A[g].shape[1]):
                    self._A[g][:, i] = rankdata(self._A[g][:, i])

            # normalize A (always, regardless of use_norm)
            if True:
                self._A[g] -= self._A[g].mean(0)
                self._A[g] /= np.sqrt((self._A[g]**2).sum(0))

            # memmap if desired
            if self._memmap:
                self._M[g] = _memmap_array(self._M[g], memmap_dir)
                self._D[g] = _memmap_array(self._D[g], memmap_dir)

        # replace the per-group dict with the list of observation arrays,
        # in group order
        self._O = O
        if lmer_opts is None:
            lmer_opts = {}
        self._lmer_opts = lmer_opts
        self._factors = factors

        # prepare for the perms and boots
        self._perms = []
        self._boots = []
        self._tp = []
        self._tb = []

        if verbose > 0:
            sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
            sys.stdout.write('Processing actual data...')
            sys.stdout.flush()
            start_time = time.time()

        # register this instance so _eval_model can look it up by id
        global _global_meld
        _global_meld[id(self)] = self

        # run for actual data (returns both perm and boot vals)
        self._R = None
        self._ss = None
        self._mer = None
        self._mer_null = None
        tp, tb, R, feat_mask, ss, mer, mer_null = _eval_model(
            id(self), None, None)
        self._R = R
        self._tp.append(tp)
        self._tb.append(tb)
        self._feat_mask = feat_mask
        self._ss = ss
        self._mer = mer
        self._mer_null = mer_null

        if verbose > 0:
            sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
            sys.stdout.flush()
示例#34
0
def main():
    """Compute summary statistics of a column expression with R.

    Reads three command-line arguments: the tab-separated input file, the
    output file name, and an expression built from column references
    (``c1``, ``c2``, ...) together with the function names and operators
    listed by ``S3_METHODS()``.

    Data rows (non-empty, not starting with ``#``) whose referenced columns
    all parse as floats are copied to a temporary file, which is loaded as an
    R data frame.  The expression is evaluated in R and its sum, mean,
    standard deviation and quantiles are written to the output file as one
    tab-separated row below a ``#``-prefixed header line.  Problems are
    reported through ``stop_err`` (presumably terminating the program --
    confirm against its definition).
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions.
    # NOTE(review): `expression` comes from argv and is later evaluated by R;
    # these whitelist checks are the only guard against arbitrary R code.
    for word in re.findall(r'[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err(
                "Invalid expression '%s': term '%s' is not recognized or allowed"
                % (expression, word))
    symbols = set()
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err(
                "Invalid expression '%s': operator '%s' is not recognized or allowed"
                % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err(
            "Invalid columns '%s': this tool requires a single column or expression"
            % expression)

    # Find all column references in the expression (c1 -> index 0, ...).
    column_refs = re.findall(r'c[0-9]+', expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    # Text mode so that writing str works on both Python 2 and Python 3; the
    # context manager guarantees the temporary file is removed on every path.
    with tempfile.NamedTemporaryFile('w+') as tmp_file:
        # Write the R header row to the temporary file
        hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
        tmp_file.write("%s\n" % hdr_str)
        skipped_lines = 0
        valid_lines = 0
        first_invalid_line = 0
        with open(datafile) as data_handle:
            for i, line in enumerate(data_handle):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                try:
                    # Every referenced column must exist and be numeric.
                    for col in cols:
                        float(fields[col])
                except (ValueError, IndexError):
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                    continue
                # Write the R data row to the temporary file
                data_str = "\t".join(fields[col] for col in cols)
                tmp_file.write("%s\n" % data_str)
                valid_lines += 1
        # R reads the file by name, so the buffered rows must be on disk.
        tmp_file.flush()

        # Fail when rows were rejected and none survived.  Counting valid rows
        # (instead of comparing against the last line index) keeps the check
        # correct when the input also contains comment or blank lines.
        if skipped_lines and not valid_lines:
            stop_err(
                "Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements."
            )
        else:
            # summary function and return labels
            summary_func = r(
                "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }"
            )
            headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
            headings_str = "\t".join(headings)

            r_data_frame = DataFrame.from_csvfile(tmp_file.name,
                                                  header=True,
                                                  sep="\t")

            outfile = open(outfile_name, 'w')
            try:
                # Expose each referenced column to R under its cN name.
                for col in column_refs:
                    r.assign(col, r["$"](r_data_frame, col))
                try:
                    summary = summary_func(r(expression))
                except RException as s:
                    # Close before reporting, as the error path always did.
                    outfile.close()
                    stop_err("Computation resulted in the following error: %s" %
                             str(s))
                outfile.write("#%s\n" % headings_str)
                print(summary)
                print(summary.r_repr())
                outfile.write(
                    "%s\n" % "\t".join(["%g" % (summary.rx2(k)[0]) for k in headings]))
            finally:
                outfile.close()

            if skipped_lines:
                print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (
                    skipped_lines, first_invalid_line))
示例#35
0
def show4():
	"""Draw a dodged bar chart of `time` per `day`, filled by project.

	Calls ``open4()``, sources the ``end.R`` script, loads ``project2.csv``
	and plots it with ggplot2.
	"""
	open4()
	r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/end.R', encoding="utf-8")
	frame = DataFrame.from_csvfile('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/project2.csv')
	# Build the plot one layer at a time, in the same order as a single
	# chained expression would evaluate them.
	chart = ggplot2.ggplot(frame)
	chart = chart + ggplot2.aes_string(x='day', y='time', fill='factor(project)')
	chart = chart + ggplot2.geom_bar(stat='identity', position='dodge')
	chart = chart + ggplot2.ggtitle("两项目时间对比图")
	chart = chart + ggplot2.labs(x='日期', y='时间 (min)')
	# Tilt the x-axis tick labels by 45 degrees.
	tilted_labels = ggplot2.element_text(angle=45)
	chart = chart + ggplot2.theme(**{'axis.text.x': tilted_labels})
	chart.plot()
示例#36
0
文件: pandas2ri.py 项目: rs2/rpy2
def rpy2py_listvector(obj):
    """Convert an R list-like object to its Python counterpart.

    Objects whose ``rclass`` contains ``'data.frame'`` are wrapped in
    ``DataFrame`` and sent back through ``rpy2py``; everything else is
    delegated to ``numpy2ri.rpy2py``.
    """
    if 'data.frame' not in obj.rclass:
        return numpy2ri.rpy2py(obj)
    return rpy2py(DataFrame(obj))
示例#37
0
    p.join()
    times_r.append(res)

from rpy2.robjects.vectors import DataFrame, FloatVector, StrVector, IntVector
# Column vectors for the benchmark data frame: the rows built from `combos`
# come first, followed by the rows built from `combos_r`.
# NOTE(review): the `combos_r` rows take 'code', 'sequence' and 'time' all
# from x[0], while `times_r` is collected above but not used here -- confirm
# this is intended.
d = {}
d['code'] = StrVector([x[0]
                       for x in combos]) + StrVector([x[0] for x in combos_r])
d['sequence'] = StrVector([x[-2] for x in combos]) + StrVector(
    [x[0] for x in combos_r])
d['time'] = FloatVector([x for x in times]) + FloatVector(
    [x[0] for x in combos_r])
d['n_loop'] = IntVector([x[-1] for x in combos]) + IntVector(
    [x[1] for x in combos_r])
# "code:sequence" label per row.  `range` replaces the Python-2-only
# `xrange` and behaves the same here on both major versions.
d['group'] = StrVector(
    [d['code'][x] + ':' + d['sequence'][x] for x in range(len(d['n_loop']))])
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2
# Benchmark figure: running time against the number of repetitions, one
# line-plus-points series per `code`, one facet per `sequence`.
p = (
    ggplot2.ggplot(dataf)
    + ggplot2.geom_line(
        ggplot2.aes_string(x="n_loop", y="time", colour="code"))
    + ggplot2.geom_point(
        ggplot2.aes_string(x="n_loop", y="time", colour="code"))
    + ggplot2.facet_wrap(Formula('~sequence'))
    + ggplot2.scale_y_continuous('running time')
    + ggplot2.scale_x_continuous('repeated n times')
    + ggplot2.xlim(0, max(n_loops))
    + ggplot2.opts(title="Benchmark (running time)")
)
示例#38
0
 def hae_ehdokkaat():
     """Load ``e2011ehd.csv`` from ``DATADIR`` as an R data frame.

     The file is read as tab-separated text with a header row and
     ``as_is=True``.
     """
     csv_path = os.path.join(DATADIR, 'e2011ehd.csv')
     read_options = {'header': True, 'sep': '\t', 'as_is': True}
     return DataFrame.from_csvfile(csv_path, **read_options)
示例#39
0

def _to_pandas_factor(obj):
    codes = [x - 1 if x > 0 else -1 for x in numpy.array(obj)]
    res = pandas.Categorical.from_codes(codes,
                                        categories=list(obj.do_slot('levels')),
                                        ordered='ordered' in obj.rclass)
    return res


# Register class-name-based conversions on the converter:
#   * integer vectors whose R class includes 'factor' go through
#     _to_pandas_factor;
#   * list vectors whose R class includes 'data.frame' are wrapped in
#     DataFrame and converted again with rpy2py.
# numpy2ri.rpy2py is the first argument of each NameClassMap -- presumably
# the fallback for every other class name; confirm against NameClassMap.
converter._rpy2py_nc_map.update({
    rinterface.IntSexpVector:
    conversion.NameClassMap(numpy2ri.rpy2py, {'factor': _to_pandas_factor}),
    rinterface.ListSexpVector:
    conversion.NameClassMap(numpy2ri.rpy2py,
                            {'data.frame': lambda obj: rpy2py(DataFrame(obj))})
})


def activate():
    warnings.warn(
        'The global conversion available with activate() '
        'is deprecated and will be removed in the next '
        'major release. Use a local converter.',
        category=DeprecationWarning)
    global original_converter
    # If module is already activated, there is nothing to do.
    if original_converter is not None:
        return

    original_converter = conversion.Converter(
示例#40
0
def lmer_feature(formula_str,
                 dat,
                 perms=None,
                 val=None,
                 factors=None,
                 **kwargs):
    """
    Run LMER on a number of permutations of the predicted data.

    Parameters
    ----------
    formula_str : str
        Model formula; the text left of ``~`` names the predicted field.
    dat : record array
        Observations.  When ``val`` is given, the predicted field of ``dat``
        is overwritten with it in place.
    perms : sequence, optional
        Index arrays (0-based) used to reorder the predicted column before
        each fit.  ``None`` fits the data once, unpermuted.
    val : array, optional
        Replacement values for the predicted field.
    factors : list of str, optional
        Fields to convert to R factors (string-valued fields are always
        converted).
    **kwargs
        Passed through to ``lme4.lmer``.

    Returns
    -------
    Record array with one field per row of the fitted coefficient table and
    one entry per permutation, holding the t-values.  Entries stay NaN for
    permutations whose fit raised, and ``None`` is returned when no fit
    succeeded at all.
    """
    # get the perm_var
    perm_var = formula_str.split('~')[0].strip()

    # set the val if necessary
    if val is not None:
        dat[perm_var] = val

    # make factor list if necessary
    if factors is None:
        factors = []

    # convert the recarray to a DataFrame
    rdf = DataFrame({
        k: (FactorVector(dat[k]) if
            (k in factors) or isinstance(dat[k][0], str) else dat[k])
        for k in dat.dtype.names
    })

    # get the column index
    col_ind = list(rdf.colnames).index(perm_var)

    # make a formula obj
    rformula = Formula(formula_str)

    # just apply to actual data if no perms
    if perms is None:
        perms = [None]

    # run on each permutation
    tvals = None
    for i, perm in enumerate(perms):
        if perm is not None:
            # set the perm (R indexing is 1-based, hence the + 1).
            # NOTE(review): the column is overwritten, so each permutation
            # is applied on top of the previous one -- confirm intended.
            rdf[col_ind] = rdf[col_ind].rx(perm + 1)

        # inside try block to catch convergence errors; Exception (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate
        try:
            ms = lme4.lmer(rformula, data=rdf, **kwargs)
        except Exception:
            continue

        # extract the result
        df = r['data.frame'](r_coef(r['summary'](ms)))
        if tvals is None:
            # init the data on the first successful fit
            # get the row names
            rows = list(r['row.names'](df))
            tvals = np.rec.fromarrays(
                [np.ones(len(perms)) * np.nan for _ in rows],
                names=','.join(rows))
        tvals[i] = tuple(df.rx2('t.value'))

    return tvals
示例#41
0
def _convert_to_dataframe(x):
    """Wrap ``x`` in a one-column R data frame.

    The values are converted with ``IntVector`` and stored under the
    column name ``'y'``.
    """
    return DataFrame({'y': IntVector(x)})
示例#42
0
import rpy2.robjects.pandas2ri
from rpy2.robjects.vectors import DataFrame
import math
import datetime

# Command-line interface: -in_csv names the CSV to analyse; -out is accepted
# but not used in the visible part of this script.
parser = argparse.ArgumentParser()
parser.add_argument("-in_csv", help="")
parser.add_argument("-out", help="")
args = parser.parse_args()

# Comma-separated input with a header row, loaded as an R data frame.
dataf = DataFrame.from_csvfile(args.in_csv, header=True, sep=",")

# Handles to the R summary functions used on the investigated sequences.
rmean, rmed, rmax, rsd, rsum = (
    robjects.r[r_name] for r_name in ('mean', 'median', 'max', 'sd', 'sum'))

# Maximum of the 'hitlen' column.
ma = rmax(dataf.rx('hitlen'))

# Handles to R coercion helpers.
as_vec, as_num, as_mat = (
    robjects.r[r_name]
    for r_name in ('as.vector', 'as.numeric', 'as.matrix'))

#test22=as_vec(dataf.rx('Length'))
示例#43
0
def show1():
	"""Draw a bar chart of `time` per `project`, filled by project.

	Calls ``open1()``, sources the ``head1.r`` script, loads ``day1.csv``
	and plots it with ggplot2.
	"""
	open1()
	r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/head1.r', encoding="utf-8")
	frame = DataFrame.from_csvfile('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/day1.csv')
	# Build the plot one layer at a time, in the same order as a single
	# chained expression would evaluate them.
	chart = ggplot2.ggplot(frame)
	chart = chart + ggplot2.aes_string(x='project', y='time', fill='project')
	chart = chart + ggplot2.geom_bar(stat='identity')
	chart = chart + ggplot2.ggtitle("今日项目时间分布图")
	chart = chart + ggplot2.labs(x='项目', y='时间 (min)')
	# Tilt the x-axis tick labels by 45 degrees.
	tilted_labels = ggplot2.element_text(angle=45)
	chart = chart + ggplot2.theme(**{'axis.text.x': tilted_labels})
	chart.plot()