Example #1
 def fit(self, x, y):
     """
     Constructs GAM model(s) to predict y from x
     
     x: 1 or 2 dimensional array of predictor values with each row being one observation
     y: 1 or 2 dimensional array of predicted values (a GAM model is constructed for each output if y is 2 dimensional)
     """
     # Input validation for standard estimators using sklearn utils
     x, y = check_X_y(x, y, accept_sparse=["csr", "csc", "coo"], multi_output=True)
     # Convert to R matrices
     if (
         x.ndim == 1
     ):  # If we're only looking at 1 x at a time, shape[1] will give an error for one-dimensional arrays. Sklearn input validation doesn't change that.
         rX = r.matrix(x, nrow=x.shape[0], ncol=1)
     else:
         rX = r.matrix(x, nrow=x.shape[0], ncol=x.shape[1])
     if (
         y.ndim == 1
     ):  # If we're only looking at 1 y at a time, shape[1] will give an error for one-dimensional arrays
         rY = r.matrix(y, nrow=y.shape[0], ncol=1)
     else:
         rY = r.matrix(y, nrow=y.shape[0], ncol=y.shape[1])
     # Compute models (one for each column in y)
     self.gammodels = self.computeGAM(rX, rY)
     return self
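A minimal standalone sketch of the NumPy-to-R conversion that fit() relies on (assumes rpy2 with numpy2ri enabled; note that R's matrix() fills values column-by-column unless byrow=TRUE is given):

import numpy as np
from rpy2.robjects import r, numpy2ri

numpy2ri.activate()  # let rpy2 hand numpy arrays to R automatically

x = np.arange(6.0)                         # 1-D input, i.e. the ndim == 1 branch above
rX = r.matrix(x, nrow=x.shape[0], ncol=1)  # becomes a 6x1 R matrix
print(tuple(r.dim(rX)))                    # (6, 1)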
Example #2
    def predict(self, x):
        """
        Get the predicted values for the given input values
        Returns an nxm np.array with the predicted y values corresponding to the given x, with m being the number of dependent variables and n the number of observations in x
        
        NOTE: assumes that the dimensions for the predicted y are the same as what was expected from the training, e.g. same amount of dependent variables
        """
        ## Check the estimator has been fit before calling this function
        check_is_fitted(self, "gammodels")

        # input is converted to an at least 2-dimensional numpy array by the sklearn util function; this is necessary for handling 1-dimensional x inputs etc. correctly (otherwise it also doesn't convert to the right number of columns and rows)
        x = check_array(x, accept_sparse=["csr", "csc", "coo"])

        # Convert to R matrices
        if (
            x.ndim == 1
        ):  # If we're only looking at 1 x at a time, shape[1] will give an error for one-dimensional arrays. Sklearn input validation doesn't change that.
            rx = r.matrix(x, nrow=x.shape[0], ncol=1)
        else:
            rx = r.matrix(x, nrow=x.shape[0], ncol=x.shape[1])
        r.assign("newxdata", rx)  # Put data in R environment for the functions to use
        r("newxdataframe<-data.frame(newxdata)")

        # Use gammodels list to predict each dependent variable and put together in R matrix
        for i, gammodel in enumerate(self.gammodels):
            r.assign("gmodel", gammodel)
            if i == 0:  # array is empty
                r("predmatrix<-predict(gmodel, newxdataframe)")
            else:
                r("predmatrix<-cbind(predmatrix,predict(gmodel,newxdataframe))")
        result = np.asarray(r["predmatrix"])
        return result
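The method above uses rpy2's assign/eval round trip; a hedged minimal sketch of that pattern in isolation:

from rpy2.robjects import r, FloatVector

r.assign("v", FloatVector([1.0, 2.0, 3.0]))  # push a vector into R's global environment
result = r("mean(v)")                        # evaluate R code against it
print(result[0])                             # 2.0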
Example #3
def fit_glm(data, family='binomial', glmnet_kwargs=None):
    """
    :param data: the data dictionary
    :param family: response type
    :param glmnet_kwargs: dictionary of keyword arguments to pass to the glmnet function in R.
    :return: pandas dataframe containing the fit model parameters. Each row corresponds to a unique value for lambda.
    """
    if not packages.isinstalled(name='glmnet'):
        utils = packages.importr('utils')
        utils.chooseCRANmirror(ind=1)
        utils.install_packages('glmnet')

    if not glmnet_kwargs:
        glmnet_kwargs = {}

    # intercept should be added as a constant 1 feature, not via glmnet. Thus, always set to 'False'.
    if 'intercept' in glmnet_kwargs.keys():
        assert glmnet_kwargs['intercept'] is False, \
            "Do not add intercept in glmnet. Please add an intercept feature to the dataset instead."
    glmnet_kwargs['intercept'] = False

    # set default parameters
    if 'alpha' not in glmnet_kwargs.keys():
        glmnet_kwargs['alpha'] = DEFAULT_ALPHA
    if 'nlambda' not in glmnet_kwargs.keys():
        glmnet_kwargs['nlambda'] = DEFAULT_NLAMBDA

    # R set-up
    numpy2ri.activate()
    glmnet = importr('glmnet')

    n_row, n_col = data['X'].shape

    # transfer to R objects
    x_ = r.matrix(data['X'], nrow=n_row)
    y_ = r.matrix(data['Y'], nrow=n_row)
    weights = FloatVector(data['sample_weights'])

    output = glmnet.glmnet(x=x_,
                           y=y_,
                           family=family,
                           weights=weights,
                           **glmnet_kwargs)
    coefs = r.t(sparse_to_full_matrix(output.rx2('beta')))
    coefs = np.array(coefs)  # drop added intercept column which is fixed at 0
    lambda_ = output.rx('lambda')
    lambda_ = np.array(r.matrix(lambda_))[0]

    coef_names = data['variable_names']
    df = pd.DataFrame(coefs, columns=coef_names)

    df['lambda_'] = lambda_
    df['alpha'] = glmnet_kwargs['alpha']

    return df
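A hypothetical usage sketch for fit_glm; the data-dictionary keys ('X', 'Y', 'sample_weights', 'variable_names') are inferred from the function body and the example data is synthetic:

import numpy as np

n = 100
data = {
    'X': np.hstack([np.ones((n, 1)), np.random.randn(n, 2)]),  # intercept as a constant-1 feature
    'Y': (np.random.rand(n, 1) > 0.5).astype(float),
    'sample_weights': np.ones(n),
    'variable_names': ['(Intercept)', 'x1', 'x2'],
}
path = fit_glm(data, family='binomial', glmnet_kwargs={'alpha': 1.0, 'nlambda': 20})
print(path[['lambda_', 'alpha']].head())  # one row per lambda on the regularization path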
Example #4
def py2mat(myobj):
    """
    Convert Python series to R matrix.
    """
    if isinstance(myobj, pandas.Series):
        # R's matrix() has no `rownames` argument; row and column names go in
        # `dimnames`, a list of (rownames, colnames)
        mat = r.matrix(myobj,
                       dimnames=r['list'](StrVector([str(i) for i in myobj.index]),
                                          StrVector([str(myobj.name)])))
    else:
        mat = r.matrix(myobj)
    return mat
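Hypothetical usage, assuming the module-level imports (pandas, rpy2's r and StrVector) and an active numpy/pandas converter, as the snippet implies:

import pandas

s = pandas.Series([1.0, 2.0, 3.0], index=['a', 'b', 'c'], name='x')
m = py2mat(s)  # 3x1 R matrix with row names 'a','b','c' and column name 'x'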
Example #5
    def diagnostics(self, fn=None, **kwargs):
        """
        Plot diagnostics for the regression.  If `fn` is provided, then save
        the results to file.  The filetype to be saved is determined by the
        extension.

        Additional kwargs are passed to the saving function (e.g., width=10)
        """
        d = {
                '.pdf': grdevices.pdf,
                '.png': grdevices.png
            }

        if fn:
            ext = os.path.splitext(fn)[1]
            try:
                saver_func = d[ext]
            except KeyError:
                raise ValueError('extension "%s" not supported, '
                        'please use one of %s' % (ext, list(d.keys())))
            saver_func(file=fn, **kwargs)

        r.layout(r.matrix([1, 2, 3, 4], 2, 2))
        r.plot(self.lm)

        if fn:
            rclose()
        return
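A hedged standalone sketch of the same device/layout pattern, fitting a model on R's built-in mtcars data and writing the four standard lm diagnostic plots to a PNG:

from rpy2.robjects import r
from rpy2.robjects.packages import importr

grdevices = importr('grDevices')
lm = r('lm(mpg ~ wt, data = mtcars)')       # example model
grdevices.png(file='diagnostics.png', width=1024, height=768)
r.layout(r.matrix(r.c(1, 2, 3, 4), 2, 2))   # 2x2 panel layout
r.plot(lm)                                  # residuals, Q-Q, scale-location, leverage
grdevices.dev_off()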
Example #6
def ComBat(X, batch, covariate=None, parametric=False, empirical_bayes=True, save_dir=None):
    # Check X
    if not isinstance(X, (pd.DataFrame, pd.Series)):
        if isinstance(X, (list, tuple, np.ndarray, Mapping)):
            df = pd.DataFrame(X)
        else:
            raise TypeError('X must be an array-like object, dictionary or pandas Dataframe/Series')
    else:
        df = X
    row_names = df.index
    r_df = pandas2ri.py2ri(df)
    # Check covariate
    if covariate is None:
        covariate = np.ones((len(batch), 1))
    else:
        if not isinstance(covariate, (list, tuple, np.ndarray)):
            if isinstance(covariate, pd.DataFrame) or isinstance(covariate, pd.Series):
                covariate = covariate.to_numpy()
            else:
                raise TypeError('covariate array must be an array like or pandas Dataframe/Series')
        else:
            covariate = np.array(covariate)
    if len(covariate.shape) == 1:
        covariate = covariate.reshape(-1, 1)
    elif len(covariate.shape) > 2:
        raise ValueError('covariate array must be 1D or 2D')
    nr, nc = covariate.shape
    r_covariate = r.matrix(covariate, nrow=nr, ncol=nc)
    # Check batch
    if not isinstance(batch, (list, tuple, np.ndarray)):
        if isinstance(batch, pd.DataFrame) or isinstance(batch, pd.Series):
            batch = batch.to_numpy()
        else:
            raise TypeError('batch array must be an array like or pandas Dataframe/Series')
    else:
        batch = np.array(batch)
    if len(batch.shape) != 1:
        if len(batch.shape) == 2 and batch.shape[1] == 1:
            batch = batch.reshape(-1)  # reshape returns a new array; rebind it
        else:
            raise ValueError('batch array must be 1D or 2D with second dimension equal to 1')
    if len(np.unique(batch)) <= 1:
        raise ValueError('batch array must have at least 2 classes')
    r_batch = Vector(batch)
    # cwd = os.path.dirname(sys.argv[0])
    cwd = os.path.dirname(os.path.abspath(__file__))
    r.setwd(cwd)
    # r.source('./Statistical_analysis/R_scripts/ComBat.R')
    r.source('./R_scripts/ComBat.R')
    r_dr_results = r.ComBat_harmonization(r_df, r_covariate, r_batch, parametric, empirical_bayes)
    R_object_dict = {}
    keys = r_dr_results.names
    for i in range(len(keys)):
        R_object_dict[keys[i]] = np.array(r_dr_results[i])
    results = pd.DataFrame(R_object_dict)
    results.index = row_names
    if save_dir is not None:
        results.to_excel(os.path.join(save_dir, 'Features_ComBat.xlsx'))
    return results
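ComBat_harmonization comes from the sourced R file; the generic source-and-call pattern, with placeholder names, is a hedged sketch like:

from rpy2.robjects import r

r.source('my_func.R')         # hypothetical file defining my_func in R's global environment
res = r['my_func'](1.0, 2.0)  # call the sourced R function from Python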
Example #7
 def makeCT(x):
     nr, nc = x.shape
     xvec = robjects.FloatVector(x.reshape(x.size))
     xr = r.matrix(xvec, nrow=nc, ncol=nr)
     df = basepy.data_frame(xr)
     ct = basepy.table(df)
     ct = replacezeroes(ct)
     return ct
Example #8
def calc_kmeans(data, clusters, iterations):
    data_as_list = data.flatten()
    data_as_rmatrix = r.matrix(data_as_list, ncol=len(data[0]), byrow=True)
    cluster_indexes = array(r.kmeans(data_as_rmatrix, clusters, iter_max=iterations, nstart=1, algorithm="MacQueen")[0])
    reordered_data = []
    for i in range(1, clusters + 1):
        indexes = where(cluster_indexes == i)[0]
        for index in indexes:
            reordered_data.append(data[index])        
    return array(reordered_data)
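Why flatten() plus byrow=True works: numpy flattens row-major while R's matrix() fills column-by-column by default, so byrow=True restores the original row order. A minimal round-trip check (rpy2 with numpy2ri assumed):

import numpy as np
from rpy2.robjects import r, numpy2ri

numpy2ri.activate()
data = np.array([[1.0, 2.0], [3.0, 4.0]])
m = r.matrix(data.flatten(), ncol=len(data[0]), byrow=True)
print(np.asarray(m))  # [[1. 2.] [3. 4.]] -- row order preserved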
Example #9
def pool_f(n):
    red = inv_phi(phi(dg, n * 10))
    deg_counts = inv_dg(red, sub, parallel=False)
    with open('data/deg_counts_sd/digamma_deg_' + str(n * 10) + '-comp.pkl',
              'wb') as buff:
        pickle.dump(deg_counts, buff)
    rPath = 'data/deg_counts_sd/digamma_deg_' + str(n * 10) + '-comp.RDS'
    nr, nc = deg_counts.shape
    dg_r = r.matrix(deg_counts, nrow=nr, ncol=nc)
    r.saveRDS(dg_r, rPath)
Example #10
def calc_kmeans_greedy(data, clusters, iterations):
    data_as_list = data.flatten()
    data_as_rmatrix = r.matrix(data_as_list, ncol=len(data[0]), byrow=True)
    cluster_indexes = array(
        r.kmeans(data_as_rmatrix,
                 clusters,
                 iter_max=iterations,
                 nstart=1,
                 algorithm="MacQueen")[0])
    ordered_clusters = order_kcluster_contents(clusters, cluster_indexes, data)
    return order_clusters(ordered_clusters)
Example #11
 def train(self, X, w, y):
     self.X, self.w = X, w
     X = r.matrix(X, nrow=X.shape[0])
     w = ro.FloatVector(w)
     y = ro.FloatVector(y)
     self.rr_model = self.grf.causal_forest(
         X, y, w, **{
             "seed": self.seed,
             "num.trees": self.n_trees,
             "honesty": True,
             "alpha": 0.1,
             "min.node.size": 1
         })
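Dotted R argument names such as num.trees and min.node.size are not legal Python keywords, which is why train() unpacks them from a dict; a hedged sketch of the same trick (the R package grf assumed installed):

from rpy2.robjects.packages import importr

grf = importr('grf')
args = {"num.trees": 2000, "seed": 42}        # dotted names must travel through a dict
# model = grf.causal_forest(X, y, w, **args)  # with X, y, w already converted to R objects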
Example #12
def calc_kmeans(data, clusters, iterations):
    data_as_list = data.flatten()
    data_as_rmatrix = r.matrix(data_as_list, ncol=len(data[0]), byrow=True)
    cluster_indexes = array(
        r.kmeans(data_as_rmatrix,
                 clusters,
                 iter_max=iterations,
                 nstart=1,
                 algorithm="MacQueen")[0])
    reordered_data = []
    for i in range(1, clusters + 1):
        indexes = where(cluster_indexes == i)[0]
        for index in indexes:
            reordered_data.append(data[index])
    return array(reordered_data)
Example #13
def weighted_silhouette(*, diss, lab, weight, **kwargs):
    from rpy2.robjects import r
    from rpy2.robjects.packages import importr
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    wc = importr('WeightedCluster')
    wsr = r['wcSilhouetteObs']
    nr, nc = diss.shape
    rdiss = r.matrix(diss, nrow=nr, ncol=nc)
    rlab = r.array(lab.values)
    rtot = r.array(weight.values)
    res = np.array(wsr(rdiss, rlab, weights=rtot, measure="ASWw"))
    return res
Example #14
def convert_to_r_data(data):
    # Input is sumu.Data

    init_r()

    numpy2ri.activate()
    datar = r.matrix(data.all().flatten(),
                     nrow=data.N,
                     ncol=data.n,
                     byrow=True)
    numpy2ri.deactivate()

    discrete = data.discrete
    arities = True if data.arities is not False else False

    datar = r['datapath_or_matrix_to_numeric_dataframe'](datar,
                                                         discrete=discrete,
                                                         arities=arities)
    return datar
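In rpy2 3.x the activate()/deactivate() pair used above can also be written as a scoped converter, which avoids leaking global conversion state; a hedged equivalent:

import numpy as np
from rpy2.robjects import r, default_converter, numpy2ri
from rpy2.robjects.conversion import localconverter

arr = np.arange(6.0)
with localconverter(default_converter + numpy2ri.converter):
    m = r.matrix(arr, nrow=3, ncol=2, byrow=True)  # conversion is active only inside this block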
Example #15
def auto_arima(endog, exog=None, freq=None):
    if freq is None:
        freq = 1
    # endog_r = r.ts(pandas2ri.py2ri(endog), freq=freq)
    # if using more recent version of rpy2, py2ri was renamed to py2rpy
    # see reference: https://stackoverflow.com/questions/55990529/module-rpy2-robjects-pandas2ri-has-no-attribute-ri2py
    endog_r = r.ts(pandas2ri.py2rpy(endog), freq=freq)
    autoarima_args = {
        "seasonal": True,
        "stationary": False,
        "trace": True,
        "max.order": 20,
        "max.p": 20,
        "max.q": 20,
        "max.P": 20,
        "max.Q": 20,
        "max.D": 20,
        "max.d": 20,
        "start.p": 1,
        "start.q": 1,
        "start.P": 1,
        "start.Q": 1
    }
    if exog is not None:
        # add noise to avoid rank-deficient error for exog
        scale = np.std(exog.values)
        z = scale * 1e-4 * np.random.randn(*exog.shape)
        exog_r = r.matrix(exog.values + z,
                          nrow=exog.shape[0],
                          ncol=exog.shape[1],
                          dimnames=[[], exog.columns.tolist()])
        fit_r = forecast.auto_arima(y=endog_r, xreg=exog_r, **autoarima_args)
    else:
        fit_r = forecast.auto_arima(y=endog_r, **autoarima_args)
    fit_dict = dict(fit_r.items())
    # for proof of this order see last comment:
    # https://stats.stackexchange.com/questions/178577/how-to-read-p-d-and-q-of-auto-arima
    p, q, P, Q, s, d, D = list(fit_dict["arma"])
    return (p, d, q), (P, D, Q, s)
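Hypothetical usage with synthetic monthly data (assumes forecast = importr('forecast') and the rpy2/pandas2ri imports at module level, as the function body implies):

import numpy as np
import pandas as pd

y = pd.Series(np.sin(np.arange(48) * 2 * np.pi / 12) + 0.1 * np.random.randn(48))
order, seasonal_order = auto_arima(y, freq=12)  # -> (p, d, q), (P, D, Q, s)
print(order, seasonal_order)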
Example #16
 def caldiffPvalues(self):
     print('calculating Statistic P_values(-log10)...', '\t', time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime()))
     r['options'](warn = -1)
     i = 0
     for ch in self.getChrlist():
         self.P_values.setdefault(ch,array('f',[]))
         self.diffPoints.setdefault(ch,array('I',[]))
         self.length[ch] = min([len(self.signal[filename][ch]) for filename in self.signal])
         mean = [float(sum(self.signal[filename][ch])/len(self.signal[filename][ch])) for filename in self.signal]
         for j in range(self.length[ch]):
             data = [self.signal[filename][ch][j] for filename in self.signal]
             data.extend(mean)
             test_data = r.matrix(FloatVector(data),ncol =2)
             test = r[self.method](test_data)
             if str(test).split()[-1] != 'NA':
                 p_value = float(str(test).split()[-1])
                 self.P_values[ch].append(-log10(p_value))
                 if p_value < self.cutoff: self.diffPoints[ch].append(j)
             else:
                 self.P_values[ch].append(0)
             if j % 10000 == 0: print("processed %i points for %s" % (j, ch), '\t', time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime()))
         i += 1
         print "%s caldiff finished (Finished %i Chroms)" %(ch,i),'\t',time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime())
Example #17
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option( "-m", "--method", dest="method", type="string",
                       help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option( "-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar = "FILE" )
    parser.add_option( "-1", "--infile1", dest="filename_input1", type="string" ,
                       help="input filename for distribution 1.")
    parser.add_option( "-2", "--infile2", dest="filename_input2", type="string" ,
                       help="input filename for distribution 2.")
    parser.add_option( "-p", "--infile-map", dest="filename_input_map", type="string" ,
                       help="input filename for mapping categories to values.")

    parser.set_defaults(
        method = "ks",
        filename_input1 = None,
        filename_input2 = None,
        filename_input_map = None,
        )
    
    (options, args) = E.Start( parser,
                               add_pipe_options = True,
                               add_psql_options = True,)


    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap( open(options.filename_input_map, "r"),
                                              map_functions=(str,float))
    
    values1, errors1 = IOTools.ReadList( open(options.filename_input1, "r"),
                                         map_category=map_category2value )
    values2, errors2 = IOTools.ReadList( open(options.filename_input2, "r"),
                                         map_category=map_category2value )    
    
    E.info( "ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1),
                                                                  len(values2), len(errors2)) )

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test( values1, values2 )
    elif options.method == "mwu":
        result = R.wilcox_test( values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)    

    R.layout(R.matrix((1,2,3,4), 2, 2, byrow = True))
        
    R.boxplot( values1, values2, col=('white','red'), main="Boxplot" )

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    print "## Results for %s" % result['method']
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print x, result[x]


    E.Stop()
Example #18
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
                      choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms.""")
    parser.add_option("-f", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option("-n", "--norm-test", dest="norm_test", action="store_true",
                      help="""test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser,
                              add_pipe_options=True)

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map, "r"),
                                             map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write("# creating %i samples from normal distribution with mean %f and stddev %f\n" % (
            len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                            map_function=f,
                                            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" % (len(values1), len(errors1),
                                                                                       len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(
            values1, values2, paired=False, correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(
            values1, values2, paired=True, correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks:

        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and (options.min_value or options.max_value):
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
Example #19
    def quantileNormalize(self):
        '''
        Description:
            Normalize between Wig class instances by Quantile
        Parameter:
            None
        Value:
            None
        '''
        ss = time()
        self.ensureSameChrsByRemove()
        wigs = self.data
        r('require("preprocessCore")')
        normq = r('normalize.quantiles')

        chrs = {}  #now it is a dictionary, but will be change to a list later
        for wig in wigs:
            for chr in wigs[wig].data:
                if chr in chrs: chrs[chr] += 1
                else: chrs[chr] = 1
        wnum = len(list(wigs.keys()))
        '''
        pops=[]
        for chr in chrs:
            if chrs[chr]<wnum:pops.append(chr)
        for chr in pops:chrs.pop(chr)
        '''
        chrs = list(chrs.keys())  #now chrs is a list
        names = list(wigs.keys())
        num = len(names)
        sizes = {}
        size = 0
        for chr in chrs:
            for name in names:
                if chr not in wigs[name].data:
                    wigs[name].data[chr] = numpy.array([0.0])
                if chr not in sizes: sizes[chr] = wigs[name].data[chr].size
                elif sizes[chr] < wigs[name].data[chr].size:
                    sizes[chr] = wigs[name].data[chr].size
            size += sizes[chr]
        lst = numpy.array([0.0])
        lst.resize(size * num, refcheck=0)

        for i in range(0, num):
            name = names[i]
            tsize = size * i
            for j in range(0, len(chrs)):
                chr = chrs[j]
                wigs[name].data[chr].resize(sizes[chr], refcheck=0)
                ttsize = tsize + sizes[chr]
                lst[tsize:ttsize] += wigs[name].data[chr][:sizes[chr]]
                tsize = ttsize
        mtr = r.matrix(FloatVector(lst), nrow=size, ncol=num)
        nmtr = normq(mtr)
        lst = numpy.array(list(nmtr))  # copy normalized values back; R matrices iterate column-major, matching the flat layout above
        for i in range(0, num):
            name = names[i]
            tsize = size * i
            for j in range(0, len(chrs)):
                chr = chrs[j]
                ttsize = tsize + sizes[chr]
                wigs[name].data[chr][:sizes[chr]] = lst[tsize:ttsize]
                tsize = ttsize
        sys.stdout.write('time cost ' + str(time() - ss) + "\n")
        return 1
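A hedged minimal sketch of the preprocessCore call at the heart of the method (assumes the Bioconductor package preprocessCore is installed in R):

import numpy
from rpy2.robjects import r, FloatVector

r('suppressMessages(require("preprocessCore"))')
normq = r('normalize.quantiles')
lst = numpy.array([1.0, 2.0, 3.0, 2.0, 4.0, 6.0])  # two samples of three values, flattened
mtr = r.matrix(FloatVector(lst), nrow=3, ncol=2)
nmtr = normq(mtr)
print(numpy.array(list(nmtr)).reshape(3, 2, order='F'))  # quantile-normalized columns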
Example #20
def facetedGGSeqLogo(logodata, chars, plotfile, width, height,
        ncol=None, char_colors=AA_COLORS_FG, xlabelsrotate=True):
    """Creates faceted logo plot.

    Designed to show several measurements on the same
    site side-by-side, potentially for many sites. Each site
    must have the same set of measurements.

    Makes panel of logo plots faceted on `logodata['facetlabel']`,
    where character stacks are labeled by `logodata['stacklabel']`
    and show the characters at the indicated heights.

    Args:
        `logodata` (pandas DataFrame)
            Contains data to plot. Should have the columns
            `facetlabel`, `stacklabel`, and a column giving the
            height of each character in `chars`.
        `chars` (list)
            Letters for which we plot heights.
        `plotfile` (str)
            Name of created plot.
        `width` (float)
            Width of plot in inches.
        `height` (float)
            Height of plot in inches.
        `ncol` (int or `None`)
            Number of columns in faceted plot. If `None`, use
            as many as needed to plot everything in one row.
        `char_colors` (dict)
            Values give color for every character in `chars`.
        `xlabelsrotate` (bool)
            Do we rotate the x-labels?

    Here is an example that creates two facets each with
    two stacks for the characters `A` and `C`:

    >>> logodata = pandas.read_csv(io.StringIO(
    ...     '''facetlabel  stacklabel   A   C
    ...            site-1       BF520 0.8 0.2
    ...            site-1       BG505 0.9 0.1
    ...            site-2       BF520 0.4 0.6
    ...            site-2       BG505 0.5 0.5'''),
    ...     delim_whitespace=True, index_col=False)
    >>> plotfile = '_facetedGGSeqLogo_test_plot.png'
    >>> facetedGGSeqLogo(logodata,
    ...         chars=['A', 'C'],
    ...         plotfile=plotfile,
    ...         width=3, height=2.5
    ...         )
    >>> os.path.isfile(plotfile)
    True

    Here is the plot created by the code block above:

    .. image:: _static/_facetedGGSeqLogo_test_plot.png
       :width: 40%
       :align: center

    """
    if os.path.isfile(plotfile):
        os.remove(plotfile)

    assert set(chars) <= set(char_colors.keys()), \
            "`char_colors` not defined for all chars"

    # get and order data columns
    df_cols = ['facetlabel', 'stacklabel'] + chars
    assert set(logodata.columns) >= set(df_cols), "df lacks required columns"
    logodata = logodata[df_cols] 

    facets = logodata['facetlabel'].unique()
    stacks = logodata['stacklabel'].unique()
    if ncol is None:
        ncol = len(facets)

    # generate list of matrices to facet
    matrices = []
    for f in facets:
        facetdata = (logodata.query('facetlabel == @f')
                     .drop('facetlabel', axis=1)
                     .set_index('stacklabel')
                     .reindex(stacks)
                     .fillna(0)
                     )
        m = r.matrix(
                facetdata.values.ravel(),
                ncol=len(stacks),
                dimnames=[chars, stacks]
                )
        matrices.append(m)
    matrices = ListVector(TaggedList(matrices,
            tags=facets.astype('str')))

    # make the plot
    with warnings.catch_warnings():
        warnings.simplefilter(SHOW_WARNINGS)
        _RFUNCS.facetedGGSeqLogo(
                matrices=matrices,
                plotfile=plotfile,
                ncol=ncol,
                width=width,
                height=height,
                xname='',
                xlabels=stacks,
                xlabelsrotate=xlabelsrotate,
                xline=True,
                yname='',
                chars=StrVector(chars),
                char_colors=StrVector([char_colors[x] for x in chars])
                )

    if not os.path.isfile(plotfile):
        raise RuntimeError("failed to create {0}".format(plotfile))
Example #21
 def quantileNormalize(self):
     '''
     Description:
         Normalize between Wig class instances by Quantile
     Parameter:
         None
     Value:
         None
     '''
     ss=time()
     self.ensureSameChrsByRemove()
     wigs=self.data
     r('require("preprocessCore")')
     normq=r('normalize.quantiles')
     
     chrs={}#now it is a dictionary, but will be change to a list later
     for wig in wigs:
         for chr in wigs[wig].data:
             if chr in chrs:chrs[chr]+=1
             else:chrs[chr]=1
     wnum=len(wigs.keys())
     '''
     pops=[]
     for chr in chrs:
         if chrs[chr]<wnum:pops.append(chr)
     for chr in pops:chrs.pop(chr)
     '''
     chrs=list(chrs.keys())#now chrs is a list
     names=list(wigs.keys())
     num=len(names)
     sizes={}
     size=0
     for chr in chrs:
         for name in names:
             if chr not in wigs[name].data:wigs[name].data[chr]=numpy.array([0.0])
             if chr not in sizes:sizes[chr]=wigs[name].data[chr].size
             elif sizes[chr]<wigs[name].data[chr].size:sizes[chr]=wigs[name].data[chr].size
         size+=sizes[chr]
     lst=numpy.array([0.0])
     lst.resize(size*num,refcheck=0)
     
     for i in range(0,num):
         name = names[i]
         tsize=size*i
         for j in range(0,len(chrs)):
             chr = chrs[j]
             wigs[name].data[chr].resize(sizes[chr],refcheck=0)
             ttsize=tsize+sizes[chr]
             lst[tsize:ttsize]+=wigs[name].data[chr][:sizes[chr]]
             tsize=ttsize
     mtr=r.matrix(FloatVector(lst),nrow = size, ncol = num)
     nmtr=normq(mtr)
     lst=numpy.array(list(nmtr))#copy normalized values back; R matrices iterate column-major, matching the flat layout above
     for i in range(0,num):
         name=names[i]
         tsize=size*i
         for j in range(0,len(chrs)):
             chr = chrs[j]
             ttsize=tsize+sizes[chr]
             wigs[name].data[chr][:sizes[chr]]=lst[tsize:ttsize]
             tsize=ttsize
     print('time cost', time() - ss)
     return 1
Example #22
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for histograms."
                      "")
    parser.add_option("-f",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n",
        "--norm-test",
        dest="norm_test",
        action="store_true",
        help=
        """test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot",
                      dest="plot",
                      action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title",
                      dest="title",
                      type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map,
                                                  "r"),
                                             map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n"
            % (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                            map_function=f,
                                            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=False,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=True,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:

        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and (options.min_value or options.max_value):
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
Example #23
def calc_kmeans_greedy(data, clusters, iterations):
    data_as_list = data.flatten()
    data_as_rmatrix = r.matrix(data_as_list, ncol=len(data[0]), byrow=True)
    cluster_indexes = array(r.kmeans(data_as_rmatrix, clusters, iter_max=iterations, nstart=1, algorithm="MacQueen")[0])
    ordered_clusters = order_kcluster_contents(clusters, cluster_indexes, data)
    return order_clusters(ordered_clusters)    
Example #24
def buildUTRExtension(infile, outfile):
    '''build new utrs by building and fitting an HMM 
    to reads upstream and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems

    * the size of the extension is limited by the window size

    * introns within UTRs are ignored.

    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is
      a good amount of reads still present in the UTR.

    The model

    The model is a three-state model::

        UTR --|--> notUTR --|--> otherTranscript --|
          ^---|      ^------|              ^-------|
                     ^-----------------------------|

    The chain starts in UTR and ends in notUTR or otherTranscript.

    The otherTranscript state models peaks within the upstream/
    downstream region of a gene. These peaks might correspond to
    additional exons or unknown transcripts. Without this state,
    the UTR might be artificially extended to include these peaks.

    Emissions are modelled with beta distributions. These
    distributions permit both bimodal (UTR) and unimodal (notUTR)
    distribution of counts.

    Parameter estimation

    Parameters are derived from known UTRs within full length 
    territories.
    
    Transitions and emissions for the otherTranscript state
    are set heuristically:

       * low probability for remaining in state "otherTranscript".
           * these transcripts should be short.

       * emissions biased towards high counts - only strong signals
           will be considered.

       * these could be estimated from known UTRs, but I am worried
           UTR extensions then will be diluted.
    

    Alternatives

    The method could be improved.

        * base level resolution? 
            * longer chains result in more data and longer running times.
            * the averaging in windows smoothes the data, which might have
                a beneficial effect.

        * raw counts instead of scaled counts?
            * better model, as highly expressed genes should give more
                confident predictions.

    '''

    # the bin size, see gtf2table - can be cleaned from column names
    # or better set as options in .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'):
        contig, strand, start, end = x['contig'], x['strand'], int(
            x['start']), int(x['end'])
        geneinfos[x['gene_id']] = (contig, strand, start, end)

    infiles = [
        infile + ".readextension_upstream_sense.tsv.gz",
        infile + ".readextension_downstream_sense.tsv.gz"
    ]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}

    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)

        parts = os.path.basename(filename).split(".")

        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)'''
            % locals())

        ##########################################
        ##########################################
        ##########################################
        ## estimation
        ##########################################
        # take only those with a 'complete' territory
        R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''')
        # save UTR
        R('''utrs = d$utr''')
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove those which are completely empty, logtransform or scale data and export
        R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )'''
          )

        utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''')
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))'''
        )
        exons = R('''lraw[,1]''')

        #######################################################
        #######################################################
        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # number of transitions between utrs
        transitions = numpy.zeros((3, 3), int)  # numpy.int is deprecated; use the builtin

        for x in range(len(utrs)):
            utr, exon = utrs[x], exons[x]

            # only consider genes with expression coverage
            # note: expression level is logscaled here, 10^1 = 10
            if exon < 0.1: continue

            # first row is column names, so x + 1
            values = list(scaled.rx(x + 1, True))

            utr_bins = utr // binsize
            nonutr_bins = (territory_size - utr) // binsize

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            outside_utr.extend([x for x in values[utr_bins:] if x <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([x for x in values[1:utr_bins] if x > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend([x for x in values[utr_bins:] if x > 0.5])

        # estimation for
        # 5% chance of transiting to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05
        # 10% chance of remaining in otherTranscript
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info( "counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" % \
                    ( len(within_utr), numpy.mean(within_utr),
                      len(outside_utr), numpy.mean(outside_utr),
                      len(otherTranscript), numpy.mean(otherTranscript)) )

        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''')

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''')
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''')
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))'''
        )

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info( "beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" % \
                    (within_a, within_b, outside_a, outside_b, other_a, other_b))

        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=1000, width=1000)

        R('''par(mfrow=c(3,1))''')
        R('''x=seq(0,1,0.02)''')
        R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''')

        R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''')

        R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''')
        R['dev.off']()

        #####################################################
        #####################################################
        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        p = R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'],
                                         fit_outside_utr$estimate['shape1'],
                                         fit_otherTranscript$estimate['shape1']),
                                shape2=c(fit_within_utr$estimate['shape2'],
                                         fit_outside_utr$estimate['shape2'],
                                         fit_otherTranscript$estimate['shape2'])) '''
              )
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''')

        E.info("fitting starts")
        #####################################################
        #####################################################
        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')
        nseqs = len(utrs)

        counter = E.Counter()

        for idx in range(len(utrs)):

            gene_id = genes[idx]

            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1))
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''')
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')
            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # subtract 1 for last exon
            try:
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
Example #26
0
def buildUTRExtension(infile, outfile):
    '''build new UTRs by building and fitting an HMM
    to reads upstream and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems

    * the size of the extension is limited by the window size

    * introns within UTRs are ignored.

    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is
      a good amount of reads still present in the UTR.

    The model

    The model is a three-state model::

        UTR --|--> notUTR --|--> otherTranscript --|
          ^---|      ^------|              ^-------|
                     ^-----------------------------|

    The chain starts in UTR and ends in notUTR or otherTranscript.

    The otherTranscript state models peaks within the upstream/
    downstream region of a gene. These peaks might correspond to
    additional exons or unknown transcripts. Without this state,
    the UTR might be artificially extended to include these peaks.

    Emissions are modelled with beta distributions. These
    distributions permit both bimodal (UTR) and unimodal (notUTR)
    distributions of counts. (A standalone sketch of the dthmm/Viterbi
    setup used here follows this example.)

    Parameter estimation

    Parameters are derived from known UTRs within full-length
    territories.

    Transitions and emissions for the otherTranscript state
    are set heuristically:

       * low probability of remaining in state "otherTranscript".
           * these transcripts should be short.

       * emissions biased towards high counts - only strong signals
           will be considered.

       * these could be estimated from known UTRs, but I am worried
           that UTR extensions would then be diluted.


    Alternatives

    The method could be improved.

        * base-level resolution?
            * longer chains result in more data and longer running times.
            * the averaging in windows smoothes the data, which might have
                a beneficial effect.

        * raw counts instead of scaled counts?
            * a better model, as highly expressed genes should give more
                confident predictions.

    '''

    # the bin size, see gtf2table - could be parsed from the column names
    # or, better, set as an option in the .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'):
        contig, strand, start, end = x['contig'], x[
            'strand'], int(x['start']), int(x['end'])
        geneinfos[x['gene_id']] = (contig, strand,
                                   start, end)

    infiles = [infile + ".readextension_upstream_sense.tsv.gz",
               infile + ".readextension_downstream_sense.tsv.gz"]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}

    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)

        parts = os.path.basename(filename).split(".")

        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' % locals() )

        ##########################################
        ##########################################
        ##########################################
        # estimation
        ##########################################
        # take only those with a 'complete' territory
        R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''')
        # save UTR
        R('''utrs = d$utr''' )
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove those which are completely empty, logtransform or scale data
        # and export
        R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )''')

        utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''' )
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''' )
        exons = R('''lraw[,1]''')

        #######################################################
        #######################################################
        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # number of transitions between utrs
        transitions = numpy.zeros((3, 3), int)

        for x in range(len(utrs)):
            utr, exon = utrs[x], exons[x]

            # only consider genes with expression coverage
            # note: expression level is log-scaled here, 10^1 = 10
            if exon < 0.1:
                continue

            # R matrices are 1-based, hence x + 1
            values = list(scaled.rx(x + 1, True))

            # cast to int: R vectors yield floats, which are not valid slice indices
            utr_bins = int(utr // binsize)
            nonutr_bins = int((territory_size - utr) // binsize)

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            outside_utr.extend([v for v in values[utr_bins:] if v <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([v for v in values[1:utr_bins] if v > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend([v for v in values[utr_bins:] if v > 0.5])

        # heuristic estimates:
        # 5% chance of transitioning to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05
        # 10% chance of remaining in otherTranscript
        # (100 of 900+100 after row normalization)
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info("counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" %
               (len(within_utr), numpy.mean(within_utr),
                len(outside_utr), numpy.mean(outside_utr),
                len(otherTranscript), numpy.mean(otherTranscript)))

        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''' )

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''' )
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''' )
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))''' )

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info("beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" %
               (within_a, within_b, outside_a, outside_b, other_a, other_b))

        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=1000, width=1000)

        R('''par(mfrow=c(3,1))''')
        R('''x=seq(0,1,0.02)''')
        R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''')

        R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''')

        R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''')
        R['dev.off']()

        #####################################################
        #####################################################
        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        p = R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'],
                                         fit_outside_utr$estimate['shape1'],
                                         fit_otherTranscript$estimate['shape1']),
                                shape2=c(fit_within_utr$estimate['shape2'],
                                         fit_outside_utr$estimate['shape2'],
                                         fit_otherTranscript$estimate['shape2'])) ''')
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''' )

        E.info("fitting starts")
        #####################################################
        #####################################################
        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')
        nseqs = len(utrs)

        counter = E.Counter()

        for idx in range(len(utrs)):

            gene_id = genes[idx]

            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1) )
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''' )
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')
            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # subtract 1 for last exon
            try:
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
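
The estimation and decoding steps above reduce to a small pattern: build a
transition matrix, specify beta emission parameters, construct a dthmm object,
and decode with Viterbi. Below is a minimal, self-contained sketch of that
pattern on synthetic data; it assumes the HiddenMarkov R package is installed,
and all numbers are illustrative rather than taken from this pipeline.

import rpy2.robjects as ro
from rpy2.robjects import r as R

R('''suppressMessages(library(HiddenMarkov))''')

# two-state toy chain with beta emissions: state 1 emits high values,
# state 2 emits low values
ro.globalenv['transitions'] = R.matrix(
    ro.FloatVector([0.9, 0.1, 0.1, 0.9]), nrow=2, byrow=True)
R('''betaparams = list(shape1=c(5, 1), shape2=c(1, 5))''')
R('''hmm = dthmm(NULL, transitions, c(1, 0), "beta", betaparams)''')

# decode a toy observation sequence; values must lie strictly inside (0, 1)
ro.globalenv['obs'] = ro.FloatVector([0.9, 0.8, 0.85, 0.2, 0.1, 0.15])
R('''hmm$x = obs''')
states = list(R('''states = Viterbi(hmm)'''))
print(states)  # expect state 1 for the first half, state 2 for the second
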
Example #27
0
from rpy2.robjects import r
import rpy2.robjects as ro


def r_matrix(x, rows):
    # Create an R matrix from a Python sequence type.
    # Note: like R's matrix(), this fills values column by column (byrow=FALSE).
    m = r.matrix(ro.FloatVector(x), nrow=rows)
    return m
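
A quick usage sketch with hypothetical values - since R's matrix() fills
column-wise by default, the four values land column by column:

m = r_matrix([1.0, 2.0, 3.0, 4.0], rows=2)
r.assign('m', m)
r('print(m)')
# ->      [,1] [,2]
#    [1,]    1    3
#    [2,]    2    4
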
Example #28
0
from rpy2.robjects import r
import numpy as np
from rpy2.robjects import numpy2ri

numpy2ri.activate()

x = r.matrix(np.array(range(9)), nrow=3, ncol=3)
r.assign('x', x)
r('print(x)')
Example #29
0
    def exprs(self, array):
        # Fetch Biobase's replacement function `exprs<-` via backtick lookup.
        exprs_set = r('`exprs<-`')
        mat = r.matrix(array, nrow=array.shape[0], ncol=array.shape[1])
        # R replacement functions return a modified copy, so reassign it.
        self.ExpressionSet = exprs_set(self.ExpressionSet, mat)
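
The backtick lookup works for any R replacement function, not just Biobase's
exprs<-. A minimal sketch with base R's rownames<- (illustrative only, no
Bioconductor required):

from rpy2.robjects import r

rownames_set = r('`rownames<-`')        # look up the replacement function
mat = r.matrix(r('1:4'), nrow=2)
mat = rownames_set(mat, r.c('a', 'b'))  # R returns a modified copy - reassign it
r.assign('mat', mat)
r('print(mat)')
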
Example #30
0
def run(args):
    """ Main function 
    :param  args: args from the command line
    """
    # argument reading
    # index of starting task
    nstart = int(args.start_gene_index)

    # index of ending task
    nend = int(args.end_gene_index)

    # get current dir
    cur_dir = os.getcwd()

    # single mask dir
    single_mask_dir = args.input_folder

    # info file
    info_file = args.gene_info

    # database dir
    db_dir = args.weight_db

    # covariance dir
    cov_dir = args.cov_dir

    # output dir
    out_dir = args.output_dir

    # read list of genes
    gene_info = pd.read_table(info_file)

    # output name
    output_name = args.output_name

    # r interface
    r_requirement()
    rpy2.robjects.numpy2ri.activate()
    importr("GBJ")

    P = nend - nstart + 1
    gene_ensg = gene_info["gene_ensg"].copy()
    gene_id = gene_info["gene_ensg"].copy()
    gene_name = gene_info["gene_ensg"].copy()

    # read z-score file
    logging.info("Read in z-score files")

    # directory of z-score
    os.chdir(single_mask_dir)

    # search for files ending with .csv
    fi = []
    fi_sqtl = []

    for file in sorted(os.listdir("./")):
        if file.endswith(".csv"):
            fi.append(file)
        if file.endswith("_sqtl.csv"):
            fi_sqtl.append(file)
    logging.info(str(len(fi)) + " files in total.")
    N = len(fi)

    # index of sqtl results (3 tissues)
    indi2 = match_list(fi_sqtl, fi)

    # index of eqtl results (47 tissues)
    indi1 = np.delete(np.arange(0, N), indi2)

    zscore_dict = {}
    for i in range(N):
        nam = "zscore_" + str(i + 1)
        zscore_dict[nam] = pd.read_csv(fi[i], header="infer")

    # output file: list of test score and p-value
    logging.info("compute p-value for genes")
    #directory of db
    os.chdir(db_dir)
    # initialize the outcome matrix
    outcome = pd.DataFrame(np.zeros(shape=(P, N + 5)))
    outcome.iloc[:, :] = np.nan
    outcome.loc[:, 0] = gene_id[(nstart - 1):nend].values
    outcome.loc[:, 1] = gene_name[(nstart - 1):nend].values
    outcome = outcome.rename(columns={0: "gene_id", 1: "gene_name"})

    # read the database
    fi = []
    for file in sorted(os.listdir(db_dir)):
        if file.endswith(".db"):
            fi.append(file)

    # calculation
    for k in range(P):
        logging.info("Gene: " + str(k + nstart))
        gene = gene_ensg[k + nstart - 1]
        print(gene)
        #read snp list
        #snp_rsid

        try:
            filename = cov_dir + "/" + gene + ".snplist"
            snp_rsid = pd.read_table(filename, header=None)
        except Exception:
            # no snplist available for this gene - skip it
            continue
        snp_rsid = list(snp_rsid.loc[:, 0])

        # matrix of weights
        # number of snps
        M = len(snp_rsid)
        logging.info("Number of SNPs: " + str(M))

        # weights1: matrix of eqtl tissues
        weights1 = np.zeros(shape=(M, len(indi1)))
        for i in range(len(indi1)):
            #logging.info("Database: " + str(i+1))
            dbname = fi[indi1[i]]
            conn = create_connection(dbname)
            cur = conn.cursor()
            sql_q = 'select * from weights where gene = "' + gene + '"'
            tmp_query = cur.execute(sql_q).fetchall()
            rsid_in_db = list(map(lambda x: str(x[0]), tmp_query))
            #rsid_in_db = map(lambda x: str(x[0]), tmp_query)
            index = match_list(rsid_in_db, snp_rsid)
            indi = index[index > -1]
            # extract the weight
            tmp_weights = np.array(list(map(lambda x: str(x[2]), tmp_query)))
            #tmp_weights = np.array(map(lambda x: str(x[2]), tmp_query))
            if sum(index > -1) > 0:
                weights1[indi, i] = tmp_weights[index > -1]

        # weights2: matrix of sqtl tissues (each intron is regarded as a separate tissue)
        weights2 = np.empty((M, 0))
        intron_name = {}
        for i in range(len(indi2)):
            #logging.info("Database: " + str(i+1))
            dbname = fi[indi2[i]]
            conn = create_connection(dbname)
            cur = conn.cursor()
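            # "_" is a single-character wildcard in SQL LIKE, so escape it
            # ("!_") to match the literal "<gene>_<intron>" identifiers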
            sql_q = "select * from weights where gene LIKE '" + gene + "!_%'" + " ESCAPE '!'"
            tmp_query = cur.execute(sql_q).fetchall()
            tmp_intron_name = list(map(lambda x: str(x[1]), tmp_query))
            #tmp_intron_name = map(lambda x: str(x[1]), tmp_query)
            intron_name[i] = np.unique(tmp_intron_name)
            L = len(intron_name[i])
            weights = np.zeros(shape=(M, L))
            if L > 0:
                for j in range(L):
                    sql_q = 'select * from weights where gene = "' + intron_name[
                        i][j] + '"'
                    tmp_query = cur.execute(sql_q).fetchall()
                    # extract the rsid for certain intron
                    rsid_in_db = list(map(lambda x: str(x[0]), tmp_query))
                    #rsid_in_db = map(lambda x: str(x[0]), tmp_query)
                    index = match_list(rsid_in_db, snp_rsid)
                    indi = index[index > -1]
                    tmp_weights = np.array(
                        list(map(lambda x: str(x[2]), tmp_query)))
                    # extract the weight
                    if sum(index > -1) > 0:
                        weights[indi, j] = tmp_weights[index > -1]
            weights2 = np.hstack((weights2, weights))

        weights_f = np.hstack((weights1, weights2))
        # covariance matrix of snps
        cov_file = cov_dir + "/" + gene_id[k + nstart - 1] + ".cov"
        cov_matrix = np.loadtxt(cov_file)

        # covariance matrix of the gene across tissues
        # (np.matrix is deprecated; use the @ operator on plain arrays)
        cov_gene = weights_f.T @ cov_matrix @ weights_f
        # normalize the covariance matrix to a correlation matrix in place
        ncol = cov_gene.shape[1]
        for i in range(ncol):
            if cov_gene[i, i] != 0:
                cov_gene[i, :] = cov_gene[i, :] / np.sqrt(cov_gene[i, i])
                cov_gene[:, i] = cov_gene[:, i] / cov_gene[i, i]

        ## zscore_gene1: z-score of eqtl tissues
        zscore_gene1 = np.empty(len(indi1))
        for i in range(len(indi1)):
            nam = "zscore_" + str(indi1[i] + 1)
            index = zscore_dict[nam]["gene"] == gene
            if sum(index) > 0:
                zscore_gene1[i] = zscore_dict[nam]["zscore"][index].values[0]
                #p-value
                outcome.loc[k, (i + 5)] = float(
                    zscore_dict[nam]["pvalue"][index].values[0])
            else:
                zscore_gene1[i] = np.nan

        ## zscore_gene2: z-score of sqtl tissues (each matched intron has a z-score including NA)
        zscore_gene2 = np.array([])
        pvalue_gene2 = np.array([])
        for i in range(len(indi2)):
            intron = intron_name[i]
            nam = "zscore_" + str(indi2[i] + 1)
            if len(intron) != 0:
                for j in range(len(intron)):
                    index = zscore_dict[nam]["gene"] == intron[j]
                    if sum(index) > 0:
                        tmp_zscore_gene = zscore_dict[nam]["zscore"][
                            index].values[0]
                        tmp_pvalue_gene = zscore_dict[nam]["pvalue"][
                            index].values[0]
                    else:
                        tmp_zscore_gene = np.nan
                        tmp_pvalue_gene = np.nan
                    zscore_gene2 = np.append(zscore_gene2, tmp_zscore_gene)
                    pvalue_gene2 = np.append(pvalue_gene2, tmp_pvalue_gene)
        ## matrix of z-scores for all eqtl and sqtl tissues
        ## (same dimension as the cov_gene matrix)
        zscore_gene = np.concatenate((zscore_gene1, zscore_gene2))
        # only keep tissues with a prediction model for the gene
        index = ~np.isnan(zscore_gene)
        if sum(index) > 1:
            zscore_gene = zscore_gene[index]
            cov_gene = cov_gene[index, :][:, index]
        elif sum(index) == 1:
            _tmp_index = np.argmax(index)
            _tmp_zscore = zscore_gene[_tmp_index]
            if _tmp_index < len(indi1):
                _tmp_pvalue = outcome.loc[k, _tmp_index + 5]
            else:
                _tmp_pvalue = pvalue_gene2[_tmp_index - len(indi1)]
            outcome.loc[k, 2] = _tmp_zscore
            outcome.loc[k, 3] = _tmp_pvalue
            continue
        else:
            # test cannot be done
            continue
        # check if the matrix is symmetric
        r_issymmetric = r['isSymmetric']
        r_cov_gene = r.matrix(cov_gene, nrow=cov_gene.shape[0])
        if r_issymmetric(r_cov_gene)[0]:
            # GBJ
            # convert the python object to r object
            r_zscore_gene = r.matrix(zscore_gene)
            # run the test
            GBJ_res = r["GBJ"](test_stats=r_zscore_gene, cor_mat=r_cov_gene)
            # output the test result to the result matrix
            outcome.loc[k, 2] = GBJ_res.rx2("GBJ")[0]
            print(GBJ_res.rx2("GBJ")[0])
            outcome.loc[k, 3] = GBJ_res.rx2("GBJ_pvalue")[0]
            print(GBJ_res.rx2("GBJ_pvalue")[0])
    # output the results
    os.chdir(out_dir)
    output_df = outcome.iloc[:, 1:4]
    filename = output_name + "_" + str(nstart) + "_" + str(nend) + ".txt"
    output_df.to_csv(filename,
                     na_rep='NA',
                     header=["gene", "test_score", "p_value"],
                     index=None,
                     sep='\t',
                     mode='w')
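
As a side note, the in-place "normalization" loop in this example implements
the standard covariance-to-correlation scaling,
corr[i, j] = cov[i, j] / (sqrt(cov[i, i]) * sqrt(cov[j, j])). A vectorized
sketch that matches it up to the zero-variance guard:

import numpy as np

def cov_to_corr(cov):
    # scale rows and columns by the square roots of the diagonal entries
    d = np.sqrt(np.diag(cov))
    d[d == 0] = 1.0  # leave zero-variance entries unscaled, as the loop does
    return cov / np.outer(d, d)
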
Example #31
0
def run(args):
    """ Main function 
    :param  args: args from the command line
    """
    # argument reading
    # index of starting task
    nstart = int(args.start_gene_index)
    
    # index of ending task
    nend = int(args.end_gene_index)
    
    # single mask dir
    single_mask_dir = args.input_folder
    
    # info file
    info_file = args.gene_info
    
    # database dir
    db_dir = args.weight_db
    
    # covariance dir 
    cov_dir = args.cov_dir
    
    # output dir 
    out_dir = args.output_dir
    
    
    # read list of genes
    gene_info = pd.read_table(info_file)
    
    # r interface
    r_requirement()
    rpy2.robjects.numpy2ri.activate()
    importr("GBJ")
    
    P = nend - nstart + 1
    gene_ensg = gene_info["gene_ensg"].copy()
    gene_id = gene_info["gene_ensg"].copy()
    gene_name = gene_info["gene_ensg"].copy()
    
    #read z-score file
    logging.info("Read in z-score files")
    
    #directory of z-score
    os.chdir(single_mask_dir) 
    
    # search for files ending with .csv
    fi = []
    
    for file in sorted(os.listdir(single_mask_dir)):
        if file.endswith(".csv"):
            fi.append(file)
    logging.info(str(len(fi)) + " files in total.")
    N = len(fi)
    
    zscore_dict = {}
    for i in range(N):
        nam = "zscore_" + str(i+1)
        zscore_dict[nam] = pd.read_csv(fi[i], header = "infer")
    
    
    #======
    #output file: list of test score and p-value
    logging.info("compute p-value for genes")
    #directory of db
    os.chdir(db_dir) 
    # initialize the outcome matrix
    outcome = pd.DataFrame(np.zeros(shape =(P,48)))
    outcome.loc[:,0] = gene_id[(nstart-1):nend]
    outcome.loc[:,1] = gene_name[(nstart-1):nend]
    outcome = outcome.rename(columns={0:"gene_id",1:"gene_name"})
   
    # read the database 
    fi = []
    for file in sorted(os.listdir(db_dir)):
        if file.endswith(".db"):
            fi.append(file)
            
    # calculation        
    for k in range(P):
        logging.info("Gene: " + str(k + nstart))
        gene = gene_ensg[k + nstart -1]
        print(gene)
        #read snp list
        #snp_rsid
        
        try:
            # join paths explicitly so cov_dir does not need a trailing slash
            filename = os.path.join(cov_dir, gene + ".snplist")
            snp_rsid = pd.read_table(filename, header=None)
        except Exception:
            continue
        snp_rsid = list(snp_rsid.loc[:,0])
        
        #matrix of weights
        M = len(snp_rsid) #number of snps
        logging.info("Number of SNPs: " + str(M))
        weights = np.zeros(shape = (M, N))
        for i in range(N):
            #logging.info("Database: " + str(i+1))
            dbname = fi[i]
            conn = create_connection(dbname)
            cur = conn.cursor()
            sql_q = 'select * from weights where gene = "' + gene + '"'
            tmp_query = cur.execute(sql_q).fetchall()
            rsid_in_db = list(map(lambda x: str(x[0]), tmp_query))
            index = match_list(rsid_in_db, snp_rsid)
            indi = index[index > -1]
            # extract the weights (reuse tmp_query; no need to re-run the query)
            tmp_weights = np.array(list(map(lambda x: str(x[2]), tmp_query)))
            # index entries are -1 for unmatched rsids, so test "> -1"
            # rather than summing the raw indices
            if sum(index > -1) > 0:
                weights[indi, i] = tmp_weights[index > -1]
            
        # covariance matrix of snps
        cov_file = os.path.join(cov_dir, gene_id[k + nstart - 1] + ".cov")
        cov_matrix = np.loadtxt(cov_file)
 
        # covariance matrix of the gene across tissues
        # (np.matrix is deprecated; use the @ operator on plain arrays)
        cov_gene = weights.T @ cov_matrix @ weights

        # normalize the covariance matrix to a correlation matrix in place
        for i in range(N):
            if cov_gene[i, i] != 0:
                cov_gene[i, :] = cov_gene[i, :] / np.sqrt(cov_gene[i, i])
                cov_gene[:, i] = cov_gene[:, i] / cov_gene[i, i]
        
        #z-score of gene in different tissue
        zscore_gene = np.full([N, 1], np.nan)   
        for i in range(N):
            nam = "zscore_" + str(i+1)
            index = zscore_dict[nam]["gene"] == gene
            if sum(index) > 0:
                zscore_gene[i] = zscore_dict[nam]["zscore"][index].values[0]
                #p-value
                outcome.loc[k, (i+4)] = float(zscore_dict[nam]["pvalue"][index].values[0])
                  
        # only keep tissues with a prediction model for the gene
        index = ~np.isnan(zscore_gene)
        indext = index.T[0]
        if sum(index) > 0:
            zscore_gene = zscore_gene[index]
            cov_gene = cov_gene[indext,:][:,indext]
        else:
            # test cannot be done
            continue
        # check if the matrix is symmetric
        if np.allclose(cov_gene,cov_gene.T):
            # GBJ
            # convert the python object to r object
            r_zscore_gene = r.matrix(zscore_gene)
            r_cov_gene = r.matrix(cov_gene, nrow = cov_gene.shape[0])
            # run the test            
            GBJ_res = r["GBJ"](test_stats=r_zscore_gene, cor_mat=r_cov_gene)
            # output the test result to the result matrix
            outcome.loc[k, 2] = GBJ_res.rx2("GBJ")[0]
            print(GBJ_res.rx2("GBJ")[0])
            outcome.loc[k, 3] = GBJ_res.rx2("GBJ_pvalue")[0]
            print(GBJ_res.rx2("GBJ_pvalue")[0])
    # output the results
    os.chdir(out_dir)
    filename = "outcome_" + str(nstart) + "_" + str(nend) + ".txt"
    outcome.to_csv(filename, header=None, index=None, sep='\t', mode='w')
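
Stripped of the bookkeeping, the GBJ call that closes both this example and
the previous one reduces to roughly the following sketch (synthetic inputs;
assumes the GBJ R package is installed):

import numpy as np
import rpy2.robjects.numpy2ri
from rpy2.robjects import r
from rpy2.robjects.packages import importr

rpy2.robjects.numpy2ri.activate()
importr("GBJ")

test_stats = np.array([1.2, -0.4, 2.5])  # toy z-scores
cor_mat = np.identity(3)                 # toy correlation matrix

GBJ_res = r["GBJ"](test_stats=r.matrix(test_stats),
                   cor_mat=r.matrix(cor_mat, nrow=3))
print(GBJ_res.rx2("GBJ")[0], GBJ_res.rx2("GBJ_pvalue")[0])
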
Example #32
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.start(parser,
                              add_pipe_options=True)

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map, "r"),
                                             map_functions=(str, float))

    values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"),
                                        map_category=map_category2value)
    values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                        map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1),
                                                                 len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    print("## Results for %s" % result['method'])
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print(x, result[x])

    E.stop()
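
For reference, the two tests dispatched above are thin wrappers over base R:
R.ks_test and R.wilcox_test are rpy2's underscore aliases for ks.test and
wilcox.test. A minimal sketch with synthetic data:

from rpy2.robjects import r, FloatVector

v1 = FloatVector([0.1, 0.4, 0.35, 0.8])
v2 = FloatVector([0.2, 0.5, 0.45, 0.9])

ks = r['ks.test'](v1, v2)                     # Kolmogorov-Smirnov
mwu = r['wilcox.test'](v1, v2, paired=False)  # Mann-Whitney U
print(ks.rx2('p.value')[0], mwu.rx2('p.value')[0])
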
Example #33
0
def siteSubsetGGSeqLogo(logodata, chars, plotfile, width, height,
        yname='', char_colors=AA_COLORS_FG, ylimits=None):
    """Creates one-row logo plot with subset of sites.

    Designed to show logo plot for a subset of sites. This
    is useful when you have data for many sites, but only
    want to look at a few of them. 

    Args:
        `logodata` (pandas DataFrame)
            Contains data to plot. Should have the columns
            `site`, `show`, and a column giving the height
            of each char in `chars`. Only sites
            where `show` is `True` are shown. Sites are 
            shown in the order they occur in this dataframe,
            with spaces every time there is an interspersed
            site with `show` being `False`. 
        `chars` (list)
            Letters for which we plot heights.
        `plotfile` (str)
            Name of created plot.
        `width` (float)
            Width of plot in inches.
        `height` (float)
            Height of plot in inches.
        `yname` (str)
            If set to a non-empty string, is the y-axis label
            and yticks are drawn.
        `char_colors` (dict)
            Values give color for every character in `chars`.
        `ylimits` (`None` or 2-tuple)
            If not `None`, should give the ylimits for the plot
            as `(ymin, ymax)`

    Here is an example that creates a plot for a subset of
    sites for two characters:

    >>> logodata = pandas.read_csv(io.StringIO(
    ...     '''site show    A    C
    ...        A101 True  0.8  0.2
    ...        N102 True  0.7  0.3
    ...        K103 False 0.1  0.9
    ...        L104 True  0.8  0.2
    ...        S105 True  0.5  0.5
    ...        T106 False 0.2  0.8
    ...        G107 False 0.4  0.6
    ...        L108 True  0.7  0.3'''),
    ...     delim_whitespace=True, index_col=False)
    >>> plotfile = '_siteSubsetGGSeqLogo_test_plot.png'
    >>> siteSubsetGGSeqLogo(logodata,
    ...         chars=['A', 'C'],
    ...         plotfile=plotfile,
    ...         width=3.5, height=2
    ...         )
    >>> os.path.isfile(plotfile)
    True

    Here is the plot created by the code block above:

    .. image:: _static/_siteSubsetGGSeqLogo_test_plot.png
       :width: 55%
       :align: center

    """
    if os.path.isfile(plotfile):
        os.remove(plotfile)

    assert set(chars) <= set(char_colors.keys()), \
            "`char_colors` not defined for all chars"

    expectcol = ['site', 'show'] + chars
    assert set(logodata.columns) >= set(expectcol), \
            "`logodata` needs these column: {0}".format(expectcol)

    assert logodata['show'].any(), "no sites to show"

    # for each consecutive set of rows not to show, keep just one
    logodata = logodata[expectcol]
    logodata['keeprow'] = (
            ((logodata['show']) | 
                (logodata['show'] != logodata['show'].shift(1)))
            )
    logodata = logodata.query('keeprow').reset_index()

    # trim first and last row if they are not to be shown
    if not logodata.iloc[0]['show']:
        logodata = logodata.iloc[1 : ].reset_index()
    if not logodata.iloc[-1]['show']:
        logodata = logodata.iloc[ : -1]

    # set site label to empty and data to zero for rows not to show
    logodata.loc[~logodata['show'], 'site'] = ''
    logodata.loc[~logodata['show'], chars] = 0
    vertlines = logodata.query('~show').index.values + 1

    # generate matrix to plot
    sites = logodata['site']
    matrix = r.matrix(logodata.set_index('site')[chars].values.ravel(),
            ncol=len(sites),
            dimnames=[chars, sites]
            )

    if ylimits is None:
        ylimits = rinterface.NULL
    else:
        ylimits = FloatVector(ylimits)

    # make the plot
    with warnings.catch_warnings():
        warnings.simplefilter(SHOW_WARNINGS)
        _RFUNCS.siteSubsetGGSeqLogo(
                mat=matrix,
                plotfile=plotfile,
                width=width,
                height=height,
                xlabels=list(map(str, sites)),
                vertlines=vertlines,
                yname=yname,
                chars=StrVector(chars),
                char_colors=StrVector([char_colors[x] for x in chars]),
                ylimits=ylimits
                )

    if not os.path.isfile(plotfile):
        raise RuntimeError("failed to create {0}".format(plotfile))