def fit(self, x, y):
    """
    Fit GAM model(s) that predict y from x.

    x: 1 or 2 dimensional array of predictor values, one observation per row
    y: 1 or 2 dimensional array of target values; when y is 2 dimensional
       the data for every output column is handed to computeGAM

    Returns self; the fitted model(s) are stored in self.gammodels.
    """
    # Standard sklearn estimator input validation (multi-output y is allowed)
    x, y = check_X_y(x, y, accept_sparse=["csr", "csc", "coo"], multi_output=True)

    # A 1-dimensional array has no shape[1], so it is treated as one column.
    predictor_cols = 1 if x.ndim == 1 else x.shape[1]
    r_predictors = r.matrix(x, nrow=x.shape[0], ncol=predictor_cols)

    target_cols = 1 if y.ndim == 1 else y.shape[1]
    r_targets = r.matrix(y, nrow=y.shape[0], ncol=target_cols)

    # Compute the models from the two R matrices
    self.gammodels = self.computeGAM(r_predictors, r_targets)
    return self
def predict(self, x):
    """
    Predict the dependent variable(s) for the given input values.

    Every model in self.gammodels is evaluated on x inside R; the first
    prediction initialises the R object `predmatrix` and each further
    prediction is appended to it with cbind. The final `predmatrix` is
    returned as a numpy array.

    NOTE: assumes that the dimensions for the predicted y are the same as
    what was expected from the training, e.g. same amount of dependent
    variables.
    """
    # The estimator must have been fit before predictions can be made
    check_is_fitted(self, "gammodels")

    # sklearn validation of the input array
    x = check_array(x, accept_sparse=["csr", "csc", "coo"])

    # A 1-dimensional array has no shape[1], so it is treated as one column.
    column_count = 1 if x.ndim == 1 else x.shape[1]
    rx = r.matrix(x, nrow=x.shape[0], ncol=column_count)

    # Publish the data in the R environment for the R snippets below
    r.assign("newxdata", rx)
    r("newxdataframe<-data.frame(newxdata)")

    for position, model in enumerate(self.gammodels):
        r.assign("gmodel", model)
        if position:
            # later models: append their predictions as a new column
            r("predmatrix<-cbind(predmatrix,predict(gmodel,newxdataframe))")
        else:
            # first model: start the prediction matrix
            r("predmatrix<-predict(gmodel, newxdataframe)")

    return np.asarray(r["predmatrix"])
def fit_glm(data, family='binomial', glmnet_kwargs=None):
    """
    Fit a glmnet model in R and return its coefficient path.

    :param data: the data dictionary; the keys 'X', 'Y', 'sample_weights' and
                 'variable_names' are read.
    :param family: response type, passed to glmnet as `family`.
    :param glmnet_kwargs: dictionary of keyword arguments to pass the glmnet function in R.
                          The dictionary is copied; the caller's object is never modified.
                          'intercept' is always forced to False, 'alpha' and 'nlambda'
                          fall back to DEFAULT_ALPHA / DEFAULT_NLAMBDA when absent.
    :return: pandas dataframe containing the fit model parameters. Each row corresponds to a
             unique value for lambda; the columns are the variable names plus 'lambda_' and
             'alpha'.
    :raises AssertionError: if glmnet_kwargs contains 'intercept' with a value other than False.
    """
    if not packages.isinstalled(name='glmnet'):
        utils = packages.importr('utils')
        utils.chooseCRANmirror(ind=1)
        utils.install_packages('glmnet')

    # Work on a private copy so the defaults filled in below do not leak
    # into the dictionary owned by the caller.
    glmnet_kwargs = dict(glmnet_kwargs) if glmnet_kwargs else {}

    # intercept should be added as a constant 1 feature, not via glmnet. Thus, always set to 'False'.
    if 'intercept' in glmnet_kwargs:
        assert glmnet_kwargs['intercept'] is False, \
            "Do not add intercept in glmnet. Please add an intercept feature to the dataset instead."
    glmnet_kwargs['intercept'] = False

    # set default parameters
    glmnet_kwargs.setdefault('alpha', DEFAULT_ALPHA)
    glmnet_kwargs.setdefault('nlambda', DEFAULT_NLAMBDA)

    # R set-up
    numpy2ri.activate()
    glmnet = importr('glmnet')
    # unpacking also rejects an X that is not 2-dimensional
    n_row, _ = data['X'].shape

    # transfer to R objects
    x_ = r.matrix(data['X'], nrow=n_row)
    y_ = r.matrix(data['Y'], nrow=n_row)
    weights = FloatVector(data['sample_weights'])

    output = glmnet.glmnet(x=x_, y=y_, family=family, weights=weights, **glmnet_kwargs)

    # 'beta' is converted to a dense matrix and transposed before it becomes
    # the body of the data frame (one row per lambda).
    coefs = r.t(sparse_to_full_matrix(output.rx2('beta')))
    coefs = np.array(coefs)

    lambda_ = output.rx('lambda')
    lambda_ = np.array(r.matrix(lambda_))[0]

    coef_names = data['variable_names']
    df = pd.DataFrame(coefs, columns=coef_names)
    df['lambda_'] = lambda_
    df['alpha'] = glmnet_kwargs['alpha']
    return df
def py2mat(myobj):
    """
    Convert a Python object to an R matrix.

    A pandas Series is passed to R's matrix() together with its index and
    its name; any other object is passed to matrix() on its own.
    """
    if not isinstance(myobj, pandas.Series):
        return r.matrix(myobj)
    # NOTE(review): base R's matrix() documents data/nrow/ncol/byrow/dimnames
    # only -- confirm that the `rownames` keyword is accepted as intended.
    return r.matrix(myobj, rownames=myobj.index, dimnames=myobj.name)
def diagnostics(self, fn=None, **kwargs):
    """
    Plot diagnostics for the regression.

    The four plots produced by R's plot() for `self.lm` are arranged on a
    2x2 layout.

    If `fn` is provided, then save the results to file.  The filetype to be
    saved is determined by the extension ('.pdf' or '.png').

    Additional kwargs are passed to the saving function (e.g., width=10);
    they are ignored when `fn` is not given.

    Raises ValueError if the extension of `fn` is not supported.
    """
    savers = {
        '.pdf': grdevices.pdf,
        '.png': grdevices.png,
    }

    if fn:
        ext = os.path.splitext(fn)[1]
        try:
            saver_func = savers[ext]
        except KeyError:
            raise ValueError('extension "%s" not supported, '
                             'please use one of %s' % (ext, sorted(savers)))
        saver_func(file=fn, **kwargs)

    try:
        r.layout(r.matrix([1, 2, 3, 4], 2, 2))
        r.plot(self.lm)
    finally:
        # Close the file device even when plotting fails, otherwise it stays
        # open and captures later plots.
        if fn:
            rclose()
    return
def ComBat(X, batch, covariate=None, parametric=False, empirical_bayes=True, save_dir=None):
    """Run the `ComBat_harmonization` function of the ComBat R script on X.

    Parameters
    ----------
    X : pandas DataFrame/Series, list, tuple, numpy array or Mapping
        Data to harmonize; anything that is not already a pandas object is
        wrapped in a DataFrame.
    batch : list, tuple, numpy array or pandas DataFrame/Series
        Batch label per observation. Must be 1D, or 2D with a single column
        (which is flattened), and contain at least two distinct values.
    covariate : None, list, tuple, numpy array or pandas DataFrame/Series
        Covariates, 1D or 2D. When None, a single column of ones with
        len(batch) rows is used.
    parametric, empirical_bayes : passed unchanged to the R function.
    save_dir : str or None
        When given, the result is also written to
        'Features_ComBat.xlsx' in that directory.

    Returns
    -------
    pandas DataFrame built from the named elements of the R result, indexed
    like X.

    Raises
    ------
    TypeError  if X, covariate or batch has an unsupported type.
    ValueError if covariate or batch has an unsupported shape, or batch has
               fewer than two classes.

    Side effect: the R working directory is set to the directory of this
    module so that './R_scripts/ComBat.R' can be sourced.
    """
    # Check X
    if isinstance(X, (pd.DataFrame, pd.Series)):
        df = X
    elif isinstance(X, (list, tuple, np.ndarray, Mapping)):
        df = pd.DataFrame(X)
    else:
        raise TypeError('X must be an array-like object, dictionary or pandas Dataframe/Series')
    row_names = df.index
    r_df = pandas2ri.py2ri(df)

    # Check covariate
    if covariate is None:
        covariate = np.ones((len(batch), 1))
    elif isinstance(covariate, (list, tuple, np.ndarray)):
        covariate = np.array(covariate)
    elif isinstance(covariate, (pd.DataFrame, pd.Series)):
        covariate = covariate.to_numpy()
    else:
        raise TypeError('covariate array must be an array like or pandas Dataframe/Series')
    if covariate.ndim == 1:
        covariate = covariate.reshape(-1, 1)
    elif covariate.ndim > 2:
        raise ValueError('covariate array must be 1D or 2D')
    nr, nc = covariate.shape
    r_covariate = r.matrix(covariate, nrow=nr, ncol=nc)

    # Check batch
    if isinstance(batch, (list, tuple, np.ndarray)):
        batch = np.array(batch)
    elif isinstance(batch, (pd.DataFrame, pd.Series)):
        batch = batch.to_numpy()
    else:
        raise TypeError('batch array must be an array like or pandas Dataframe/Series')
    if batch.ndim != 1:
        if batch.ndim == 2 and batch.shape[1] == 1:
            # reshape returns a new array; it must be assigned back for the
            # single-column input to actually become 1D
            batch = batch.reshape(-1)
        else:
            raise ValueError('batch array must be 1D or 2D with second dimension equal to 1')
    if len(np.unique(batch)) <= 1:
        raise ValueError('batch array must have at least 2 classes')
    r_batch = Vector(batch)

    # The R script is located relative to this module, not to the caller's cwd
    cwd = os.path.dirname(os.path.abspath(__file__))
    r.setwd(cwd)
    r.source('./R_scripts/ComBat.R')
    r_dr_results = r.ComBat_harmonization(r_df, r_covariate, r_batch, parametric, empirical_bayes)

    # One DataFrame column per named element of the R result
    R_object_dict = {
        key: np.array(r_dr_results[position])
        for position, key in enumerate(r_dr_results.names)
    }
    results = pd.DataFrame(R_object_dict)
    results.index = row_names

    if save_dir is not None:
        results.to_excel(os.path.join(save_dir, 'Features_ComBat.xlsx'))
    return results
def makeCT(x):
    """Build an R table from the 2-dimensional array x.

    The values of x are flattened into an R float vector, shaped into an R
    matrix with x's column count as row count (and vice versa), turned into
    a data frame, tabulated with `basepy.table` and finally passed through
    `replacezeroes`, whose result is returned.
    """
    row_count, col_count = x.shape
    flat_values = robjects.FloatVector(x.reshape(x.size))
    # note the swapped dimensions: nrow takes the column count of x
    r_matrix = r.matrix(flat_values, nrow=col_count, ncol=row_count)
    frame = basepy.data_frame(r_matrix)
    return replacezeroes(basepy.table(frame))
def calc_kmeans(data, clusters, iterations):
    """Cluster the rows of `data` with R's kmeans and group them by cluster.

    kmeans is run with the "MacQueen" algorithm, `clusters` centers,
    `iterations` as iter_max and a single start. The rows of `data` are then
    returned as an array ordered by cluster index (1..clusters), keeping
    the original row order inside each cluster.
    """
    r_data = r.matrix(data.flatten(), ncol = len(data[0]), byrow = True)
    fit = r.kmeans(r_data, clusters, iter_max = iterations, nstart = 1,
                   algorithm = "MacQueen")
    # the first element of the kmeans result holds the cluster index per row
    labels = array(fit[0])
    grouped_rows = [
        data[row]
        for label in range(1, clusters + 1)
        for row in where(labels == label)[0]
    ]
    return array(grouped_rows)
def pool_f(n):
    """Compute the degree counts for `n * 10` and store them on disk.

    The result of `inv_dg(inv_phi(phi(dg, n * 10)), sub, parallel=False)` is
    pickled to 'data/deg_counts_sd/digamma_deg_<n*10>-comp.pkl' and saved as
    an R matrix to the matching '.RDS' file.
    """
    reduced = inv_phi(phi(dg, n * 10))
    deg_counts = inv_dg(reduced, sub, parallel=False)

    path_stem = 'data/deg_counts_sd/digamma_deg_' + str(n * 10)

    with open(path_stem + '-comp.pkl', 'wb') as pickle_file:
        pickle.dump(deg_counts, pickle_file)

    row_count, col_count = deg_counts.shape
    r_counts = r.matrix(deg_counts, nrow=row_count, ncol=col_count)
    r.saveRDS(r_counts, path_stem + '-comp.RDS')
def calc_kmeans_greedy(data, clusters, iterations):
    """Cluster the rows of `data` with R's kmeans and order the clusters.

    kmeans is run with the "MacQueen" algorithm, `clusters` centers,
    `iterations` as iter_max and a single start. The cluster index of every
    row is handed to `order_kcluster_contents`, and the result of
    `order_clusters` on its output is returned.
    """
    r_data = r.matrix(data.flatten(), ncol=len(data[0]), byrow=True)
    fit = r.kmeans(r_data, clusters, iter_max=iterations, nstart=1,
                   algorithm="MacQueen")
    # the first element of the kmeans result holds the cluster index per row
    labels = array(fit[0])
    return order_clusters(order_kcluster_contents(clusters, labels, data))
def train(self, X, w, y):
    """Fit a causal forest and keep it in self.rr_model.

    X and w are stored on the instance unchanged. X is converted to an R
    matrix (one row per row of X), w and y to R float vectors, and
    `self.grf.causal_forest` is called with them in the order (X, y, w)
    together with the fixed forest settings below.
    """
    self.X, self.w = X, w

    r_X = r.matrix(X, nrow=X.shape[0])
    r_w = ro.FloatVector(w)
    r_y = ro.FloatVector(y)

    # keyword names contain dots, so they can only be passed via a dict
    forest_settings = {
        "seed": self.seed,
        "num.trees": self.n_trees,
        "honesty": True,
        "alpha": 0.1,
        "min.node.size": 1,
    }
    self.rr_model = self.grf.causal_forest(r_X, r_y, r_w, **forest_settings)
def calc_kmeans(data, clusters, iterations):
    """Return the rows of `data` regrouped by their k-means cluster.

    R's kmeans ("MacQueen" algorithm, one start, `iterations` as iter_max)
    assigns each row to one of `clusters` clusters. The output array lists
    the rows of cluster 1 first, then cluster 2, and so on; inside a
    cluster the rows keep their original order.
    """
    row_width = len(data[0])
    r_data = r.matrix(data.flatten(), ncol=row_width, byrow=True)
    kmeans_result = r.kmeans(r_data, clusters, iter_max=iterations,
                             nstart=1, algorithm="MacQueen")
    # element 0 of the kmeans result is the per-row cluster index
    assignment = array(kmeans_result[0])

    regrouped = []
    for cluster_id in range(1, clusters + 1):
        member_rows = where(assignment == cluster_id)[0]
        regrouped.extend(data[row] for row in member_rows)
    return array(regrouped)
def weighted_silhouette(*, diss, lab, weight, **kwargs):
    """Call R's `wcSilhouetteObs` with measure "ASWw" and return the result.

    diss   : 2-dimensional array, converted to an R matrix of the same shape
    lab    : object with a `.values` attribute, passed as second argument
    weight : object with a `.values` attribute, passed as `weights`
    kwargs : accepted but not used

    Returns the R result converted with np.array. numpy2ri conversion is
    activated as a side effect and left active.
    """
    from rpy2.robjects import numpy2ri
    from rpy2.robjects import r
    from rpy2.robjects.packages import importr

    numpy2ri.activate()

    # importing the package makes wcSilhouetteObs available in R
    importr('WeightedCluster')
    silhouette_obs = r['wcSilhouetteObs']

    row_count, col_count = diss.shape
    r_diss = r.matrix(diss, nrow=row_count, ncol=col_count)
    r_labels = r.array(lab.values)
    r_weights = r.array(weight.values)

    return np.array(silhouette_obs(r_diss, r_labels, weights=r_weights, measure="ASWw"))
def convert_to_r_data(data):
    """Convert `data` into a numeric R data frame.

    Input is sumu.Data. Its values (`data.all()`) are laid out row by row in
    an R matrix with `data.N` rows and `data.n` columns, which is then
    passed to the R function `datapath_or_matrix_to_numeric_dataframe`
    together with `data.discrete` and a flag telling whether `data.arities`
    is anything other than False.

    Returns the object produced by that R function.
    """
    init_r()

    # numpy conversion is only wanted for building the matrix; make sure it
    # is switched off again even if the conversion fails.
    numpy2ri.activate()
    try:
        datar = r.matrix(data.all().flatten(), nrow=data.N, ncol=data.n, byrow=True)
    finally:
        numpy2ri.deactivate()

    discrete = data.discrete
    arities = data.arities is not False
    datar = r['datapath_or_matrix_to_numeric_dataframe'](datar, discrete=discrete, arities=arities)
    return datar
def auto_arima(endog, exog=None, freq=None):
    """Select ARIMA orders with `forecast.auto_arima` in R.

    endog : pandas object converted with pandas2ri.py2rpy and wrapped in an
            R time series of frequency `freq` (1 when freq is None)
    exog  : optional pandas DataFrame of regressors, passed as `xreg`

    Returns the tuple pair ((p, d, q), (P, D, Q, s)) read from the "arma"
    element of the fitted R object.
    """
    frequency = 1 if freq is None else freq

    # if using more recent version of rpy2, py2ri was renamed to py2rpy
    # see reference: https://stackoverflow.com/questions/55990529/module-rpy2-robjects-pandas2ri-has-no-attribute-ri2py
    endog_r = r.ts(pandas2ri.py2rpy(endog), freq=frequency)

    search_options = {
        "seasonal": True,
        "stationary": False,
        "trace": True,
        "max.order": 20,
        "max.p": 20,
        "max.q": 20,
        "max.P": 20,
        "max.Q": 20,
        "max.D": 20,
        "max.d": 20,
        "start.p": 1,
        "start.q": 1,
        "start.P": 1,
        "start.Q": 1,
    }

    if exog is None:
        fit_r = forecast.auto_arima(y=endog_r, **search_options)
    else:
        # add noise to avoid rank-deficient error for exog
        noise_scale = np.std(exog.values)
        noise = noise_scale * 1e-4 * np.random.randn(*exog.shape)
        n_rows, n_cols = exog.shape[0], exog.shape[1]
        exog_r = r.matrix(exog.values + noise,
                          nrow=n_rows,
                          ncol=n_cols,
                          dimnames=[[], exog.columns.tolist()])
        fit_r = forecast.auto_arima(y=endog_r, xreg=exog_r, **search_options)

    fitted = dict(fit_r.items())
    # for proof of this order see last comment:
    # https://stats.stackexchange.com/questions/178577/how-to-read-p-d-and-q-of-auto-arima
    p, q, P, Q, s, d, D = list(fitted["arma"])
    order = (p, d, q)
    seasonal_order = (P, D, Q, s)
    return order, seasonal_order
def caldiffPvalues(self): print 'calcularing Statistic P_values(-log10)...','\t',time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime()) r['options'](warn = -1) i = 0 for ch in self.getChrlist(): self.P_values.setdefault(ch,array('f',[])) self.diffPoints.setdefault(ch,array('I',[])) self.length[ch] = min([len(self.signal[filename][ch]) for filename in self.signal]) mean = [float(sum(self.signal[filename][ch])/len(self.signal[filename][ch])) for filename in self.signal] for j in range(self.length[ch]): data = [self.signal[filename][ch][j] for filename in self.signal] data.extend(mean) test_data = r.matrix(FloatVector(data),ncol =2) test = r[self.method](test_data) if str(test).split()[-1] != 'NA': p_value = float(str(test).split()[-1]) self.P_values[ch].append(-log10(p_value)) if p_value < self.cutoff: self.diffPoints[ch].append(j) else: self.P_values[ch].append(0) if (j%10000 == 0):print "processed %i points for %s" % (j,ch),'\t',time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime()) i += 1 print "%s caldiff finished (Finished %i Chroms)" %(ch,i),'\t',time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime())
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version = "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option( "-m", "--method", dest="method", type="string", help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]") parser.add_option( "-a", "--hardcopy", dest="hardcopy", type="string", help="write hardcopy to file.", metavar = "FILE" ) parser.add_option( "-1", "--infile1", dest="filename_input1", type="string" , help="input filename for distribution 1.") parser.add_option( "-2", "--infile2", dest="filename_input2", type="string" , help="input filename for distribution 2.") parser.add_option( "-p", "--infile-map", dest="filename_input_map", type="string" , help="input filename for mapping categories to values.") parser.set_defaults( method = "ks", filename_input1 = None, filename_input2 = None, filename_input_map = None, ) (options, args) = E.Start( parser, add_pipe_options = True, add_psql_options = True,) map_category2value = {} if options.filename_input_map: map_category2value = IOTools.ReadMap( open(options.filename_input_map, "r"), map_functions=(str,float)) values1, errors1 = IOTools.ReadList( open(options.filename_input1, "r"), map_category=map_category2value ) values2, errors2 = IOTools.ReadList( open(options.filename_input2, "r"), map_category=map_category2value ) E.info( "ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1), len(values2), len(errors2)) ) if options.hardcopy: R.png(options.hardcopy, width=1024, height=768) if options.method == "ks": result = R.ks_test( values1, values2 ) elif options.method == "mwu": result = R.wilcox_test( values1, values2, paired=False) R.assign("v1", values1) R.assign("v2", values2) R.layout(R.matrix((1,2,3,4), 2, 2, byrow = True)) R.boxplot( values1, values2, col=('white','red'), main="Boxplot" ) R("""qqplot( v1, v2, main 
='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""") R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""") R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""") R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") print "## Results for %s" % result['method'] for x in ['p.value', 'statistic', 'alternative', 'method']: print x, result[x] E.Stop()
E.info( "ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1), len(values2), len(errors2)) ) if options.hardcopy: R.png(options.hardcopy, width=1024, height=768) if options.method == "ks": result = R.ks_test( values1, values2 ) elif options.method == "mwu": result = R.wilcox_test( values1, values2, paired=False) R.assign("v1", values1) R.assign("v2", values2) R.layout(R.matrix((1,2,3,4), 2, 2, byrow = True)) R.boxplot( values1, values2, col=('white','red'), main="Boxplot" ) R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""") R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""") R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""") R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") print "## Results for %s" % result['method'] for x in ['p.value', 'statistic', 'alternative', 'method']: print x, result[x]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one or two lists of values, runs the statistical test selected
    with --method in R, optionally draws a 2x2 panel of diagnostic plots
    and writes the test result plus summary statistics of both value lists
    to options.stdout.

    Positional arguments of the form ``key=value`` are forwarded to the R
    test function as keyword arguments (only the first "=" separates key
    and value); all other positional arguments are forwarded as-is.

    Raises ValueError if a paired test is requested for lists of different
    length.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
                      choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms.""")
    parser.add_option("-f", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option("-n", "--norm-test", dest="norm_test", action="store_true",
                      help="""test if a set of values is normally distributed. Mean and variance are calculated from the data.""")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Split positional arguments into R keyword and positional arguments.
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split once only, so values may themselves contain "="
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as mapfile:
            map_category2value = IOTools.ReadMap(mapfile, map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" % ";".join(map(str, errors1)))

    if options.norm_test:
        # compare against a normal sample with the same mean and stddev
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write("# creating %i samples from normal distribution with mean %f and stddev %f\n" % (
            len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(infile2,
                                                map_function=f,
                                                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" % ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" % (len(values1), len(errors1),
                                                                                       len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(
            values1, values2, paired=False, correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(
            values1, values2, paired=True, correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # An explicit limit of 0 is a valid value, so test against None
        # rather than relying on truthiness.
        range_given = (options.min_value is not None
                       or options.max_value is not None)

        extra_options = ""
        if options.num_bins and not range_given:
            extra_options += ", breaks=%i" % options.num_bins
        elif options.num_bins and range_given:
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))
        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
def quantileNormalize(self):
    '''
    Description:
        Normalize between Wig class instances by Quantile.

        The data of all Wig instances in self.data are laid out as one
        column per instance (chromosomes concatenated, each padded to the
        largest size found for that chromosome), normalized with
        normalize.quantiles from the R package preprocessCore, and the
        normalized values are written back into the instances.

    Parameter:
        None
    Value:
        1
    '''
    ss = time()
    self.ensureSameChrsByRemove()
    wigs = self.data
    r('require("preprocessCore")')
    normq = r('normalize.quantiles')

    # collect every chromosome name that occurs in any instance
    seen = {}
    for wig in wigs:
        for chrom in wigs[wig].data:
            seen[chrom] = seen.get(chrom, 0) + 1
    chrs = list(seen.keys())
    names = list(wigs.keys())
    num = len(names)

    # per chromosome: the largest array size among all instances;
    # instances missing a chromosome get a single zero
    sizes = {}
    size = 0
    for chrom in chrs:
        for name in names:
            if chrom not in wigs[name].data:
                wigs[name].data[chrom] = numpy.array([0.0])
            if chrom not in sizes:
                sizes[chrom] = wigs[name].data[chrom].size
            elif sizes[chrom] < wigs[name].data[chrom].size:
                sizes[chrom] = wigs[name].data[chrom].size
        size += sizes[chrom]

    # one block of `size` values per instance, i.e. column-major storage of
    # a size x num matrix
    lst = numpy.zeros(size * num)
    for i, name in enumerate(names):
        tsize = size * i
        for chrom in chrs:
            wigs[name].data[chrom].resize(sizes[chrom], refcheck=0)
            ttsize = tsize + sizes[chrom]
            lst[tsize:ttsize] += wigs[name].data[chrom][:sizes[chrom]]
            tsize = ttsize

    mtr = r.matrix(FloatVector(lst), nrow=size, ncol=num)
    nmtr = normq(mtr)

    # Flatten the normalized matrix back into the same column-major layout
    # as `lst`; the write-back must use these values, not the raw input.
    normalized = numpy.asarray(nmtr, dtype=float).ravel(order='F')

    for i, name in enumerate(names):
        tsize = size * i
        for chrom in chrs:
            ttsize = tsize + sizes[chrom]
            wigs[name].data[chrom][:sizes[chrom]] = normalized[tsize:ttsize]
            tsize = ttsize

    # write() accepts a single string
    sys.stdout.write('time cost ' + str(time() - ss) + "\n")
    return 1
def facetedGGSeqLogo(logodata, chars, plotfile, width, height,
                     ncol=None, char_colors=AA_COLORS_FG,
                     xlabelsrotate=True):
    """Draws a panel of logo plots, one facet per `facetlabel`.

    Intended for showing several measurements of the same site
    side-by-side, potentially for many sites. Each site must have
    the same set of measurements.

    Within a facet, each character stack is labeled by
    `logodata['stacklabel']` and shows the characters in `chars`
    at the heights given in `logodata`. Stacks missing for a facet
    are drawn with all heights 0. Any existing `plotfile` is
    removed first.

    Args:
        `logodata` (pandas DataFrame)
            Contains data to plot. Should have the columns
            `facetlabel`, `stacklabel`, and a column giving the
            height of each character in `chars`.
        `chars` (list)
            Letters for which we plot heights.
        `plotfile` (str)
            Name of created plot.
        `width` (float)
            Width of plot in inches.
        `height` (float)
            Height of plot in inches.
        `ncol` (int or `None`)
            Number of columns in faceted plot. If `None`, use
            as many as needed to plot everything in one row.
        `char_colors` (dict)
            Values give color for every character in `chars`.
        `xlabelsrotate` (bool)
            Do we rotate the x-labels?

    Raises `RuntimeError` if `plotfile` does not exist after plotting.

    Here is an example that creates two facets each with two stacks
    for the characters `A` and `C`:

    >>> logodata = pandas.read_csv(io.StringIO(
    ...     '''facetlabel  stacklabel   A   C
    ...        site-1      BF520       0.8 0.2
    ...        site-1      BG505       0.9 0.1
    ...        site-2      BF520       0.4 0.6
    ...        site-2      BG505       0.5 0.5'''),
    ...     delim_whitespace=True, index_col=False)
    >>> plotfile = '_facetedGGSeqLogo_test_plot.png'
    >>> facetedGGSeqLogo(logodata,
    ...         chars=['A', 'C'],
    ...         plotfile=plotfile,
    ...         width=3, height=2.5
    ...         )
    >>> os.path.isfile(plotfile)
    True

    Here is the plot created by the code block above:

    .. image:: _static/_facetedGGSeqLogo_test_plot.png
       :width: 40%
       :align: center

    """
    if os.path.isfile(plotfile):
        os.remove(plotfile)

    assert set(chars) <= set(char_colors), \
            "`char_colors` not defined for all chars"

    # restrict the data to the needed columns, in a fixed order
    needed_cols = ['facetlabel', 'stacklabel'] + chars
    assert set(logodata.columns) >= set(needed_cols), "df lacks required columns"
    logodata = logodata[needed_cols]

    facets = logodata['facetlabel'].unique()
    stacks = logodata['stacklabel'].unique()
    if ncol is None:
        ncol = len(facets)

    # one chars x stacks matrix per facet
    facet_matrices = []
    for f in facets:
        per_facet = logodata.query('facetlabel == @f')
        per_facet = per_facet.drop('facetlabel', axis=1)
        per_facet = per_facet.set_index('stacklabel')
        # every facet gets all stacks; absent ones become rows of zeros
        per_facet = per_facet.reindex(stacks).fillna(0)
        heights = per_facet.values.ravel()
        facet_matrices.append(
            r.matrix(heights, ncol=len(stacks), dimnames=[chars, stacks]))

    tagged = TaggedList(facet_matrices, tags=facets.astype('str'))
    matrices = ListVector(tagged)

    color_vector = StrVector([char_colors[x] for x in chars])

    # make the plot
    with warnings.catch_warnings():
        warnings.simplefilter(SHOW_WARNINGS)
        _RFUNCS.facetedGGSeqLogo(
                matrices=matrices,
                plotfile=plotfile,
                ncol=ncol,
                width=width,
                height=height,
                xname='',
                xlabels=stacks,
                xlabelsrotate=xlabelsrotate,
                xline=True,
                yname='',
                chars=StrVector(chars),
                char_colors=color_vector
                )

    if os.path.isfile(plotfile):
        return
    raise RuntimeError("failed to create {0}".format(plotfile))
def quantileNormalize(self): ''' Description: Normalize between Wig class instances by Quantile Parameter: None Value: None ''' ss=time() self.ensureSameChrsByRemove() wigs=self.data r('require("preprocessCore")') normq=r('normalize.quantiles') chrs={}#now it is a dictionary, but will be change to a list later for wig in wigs: for chr in wigs[wig].data: if chrs.has_key(chr):chrs[chr]+=1 else:chrs[chr]=1 wnum=len(wigs.keys()) ''' pops=[] for chr in chrs: if chrs[chr]<wnum:pops.append(chr) for chr in pops:chrs.pop(chr) ''' chrs=chrs.keys()#now chrs is a list names=wigs.keys() num=len(names) sizes={} size=0 for chr in chrs: for name in names: if not wigs[name].data.has_key(chr):wigs[name].data[chr]=numpy.array([0.0]) if not sizes.has_key(chr):sizes[chr]=wigs[name].data[chr].size elif sizes[chr]<wigs[name].data[chr].size:sizes[chr]=wigs[name].data[chr].size size+=sizes[chr] lst=numpy.array([0.0]) lst.resize(size*num,refcheck=0) for i in range(0,num): name = names[i] tsize=size*i for j in range(0,len(chrs)): chr = chrs[j] wigs[name].data[chr].resize(sizes[chr],refcheck=0) ttsize=tsize+sizes[chr] lst[tsize:ttsize]+=wigs[name].data[chr][:sizes[chr]] tsize=ttsize mtr=r.matrix(FloatVector(lst),nrow = size, ncol = num) nmtr=normq(mtr) for i in range(0,num): name=names[i] tsize=size*i for j in range(0,len(chrs)): chr = chrs[j] ttsize=tsize+sizes[chr] wigs[name].data[chr][:sizes[chr]]=lst[tsize:ttsize] tsize=ttsize print 'time cost',time()-ss return 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values (with ``--norm-test`` the second list is drawn
    from a normal distribution with the mean and standard deviation of the
    first), runs the R test selected with ``--method`` and, unless
    ``--skip-plot`` is given, draws a boxplot, a quantile-quantile plot and
    relative/absolute frequency histograms.  The test result and summary
    statistics of both lists are written to ``options.stdout``.

    Positional arguments of the form ``key=value`` are passed to the R test
    function as keyword arguments, all others as positional arguments.

    NOTE(review): *argv* is only defaulted here; it is not forwarded to
    ``E.Start`` below.

    Raises:
        ValueError: if a paired test is requested for lists of different
            length.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms.")
    parser.add_option("-f", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n", "--norm-test", dest="norm_test", action="store_true",
        help="test if a set of values is normally distributed. \nMean and variance are calculated from the data.")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Split the positional arguments into keyword and plain arguments for
    # the R test function.  Split on the first "=" only so that values may
    # themselves contain "=".
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # With a category map the input columns are read as strings and mapped
    # to floats; otherwise they are read as floats directly.
    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as mapfile:
            map_category2value = IOTools.ReadMap(mapfile,
                                                 map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n" %
            (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        # Close the second input once it has been read.
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2,
                map_function=f,
                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1, values2, paired=True, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be
            # added at the end after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks: the plotted range always covers the data and is
        # widened by --min-value / --max-value if given.
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # Compare against None so that an explicit limit of 0 counts as
        # "given".
        explicit_range = (options.min_value is not None or
                          options.max_value is not None)

        extra_options = ""
        if options.num_bins and not explicit_range:
            extra_options += ", breaks=%i" % options.num_bins
        elif options.num_bins and explicit_range:
            # num_bins bins need num_bins + 1 break points, and R's hist
            # requires the breaks to span the full data range.
            bin_size = float(max_value - min_value) / options.num_bins
            breaks = [min_value + x * bin_size
                      for x in range(options.num_bins + 1)]
            # pin the last break so rounding cannot leave it below the
            # maximum
            breaks[-1] = max_value
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))
        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [min_value + x * options.bin_size
                      for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" %
              ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" %
              ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics; keys are suffixed with 1 / 2 for the two inputs
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
def calc_kmeans_greedy(data, clusters, iterations):
    """Cluster the rows of *data* with a single run of R's MacQueen k-means.

    data       -- two-dimensional array, one observation per row
    clusters   -- passed to R's ``kmeans`` as its second argument
    iterations -- passed to R's ``kmeans`` as ``iter_max``

    Returns whatever ``order_clusters`` produces from the clusters that
    ``order_kcluster_contents`` builds out of the k-means result.
    """
    # Hand the data to R as a row-major matrix with one column per feature.
    flat_values = data.flatten()
    n_columns = len(data[0])
    r_data = r.matrix(flat_values, ncol=n_columns, byrow=True)

    kmeans_result = r.kmeans(r_data,
                             clusters,
                             iter_max=iterations,
                             nstart=1,
                             algorithm="MacQueen")

    # Only the first component of the R result is used.
    assignments = array(kmeans_result[0])

    grouped = order_kcluster_contents(clusters, assignments, data)
    return order_clusters(grouped)
def buildUTRExtension(infile, outfile):
    '''build new utrs by building and fitting an HMM
    to reads upstream and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems

    * the size of the extension is limited by the window size

    * introns within UTRs are ignored.

    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is
      a good amount of reads still present in the UTR.

    The model

    The model is a three-state model::

        UTR --|--> notUTR --|--> otherTranscript --|
          ^---|      ^------|              ^-------|
                     ^-----------------------------|

    The chain starts in UTR and ends in notUTR or otherTranscript.

    The otherTranscript state models peaks within the upstream/
    downstream region of a gene. These peaks might correspond to
    additional exons or unknown transcripts. Without this state,
    the UTR might be artificially extended to include these peaks.

    Emissions are modelled with beta distributions. These
    distributions permit both bimodal (UTR) and unimodal (notUTR)
    distribution of counts.

    Parameter estimation

    Parameters are derived from known UTRs within full length
    territories.

    Transitions and emissions for the otherTranscript state
    are set heuristically:

       * low probability for remaining in state "otherTranscript".
           * these transcripts should be short.

       * emissions biased towards high counts - only strong signals
           will be considered.

       * these could be estimated from known UTRs, but I am worried
           UTR extensions then will be diluted.

    Alternatives

    The method could be improved.

        * base level resolution?
            * longer chains result in more data and longer running times.
            * the averaging in windows smoothes the data, which might have
                a beneficial effect.

        * raw counts instead of scaled counts?
            * better model, as highly expressed genes should give more
                confident predictions.

    For each of the two input tables (upstream, downstream) a record
    ``(old_utr, new_utr, max_utr, status)`` is stored per gene, with
    status one of "ok", "max", "fail", "notexpressed" or
    "no observations".
    '''

    # the bin size , see gtf2table - can be cleaned from column names
    # or better set as options in .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'):
        contig, strand, start, end = (x['contig'], x['strand'],
                                      int(x['start']), int(x['end']))
        geneinfos[x['gene_id']] = (contig, strand, start, end)

    infiles = [infile + ".readextension_upstream_sense.tsv.gz",
               infile + ".readextension_downstream_sense.tsv.gz"]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}

    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)

        parts = os.path.basename(filename).split(".")

        R('''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' %
          {"filename": filename})

        ##########################################
        # estimation
        ##########################################
        # Rows are selected with logical masks rather than x[-which(cond)]:
        # when no row matches cond, which() is empty and x[-integer(0)]
        # would select nothing instead of everything.

        # take only those with a 'complete' territory
        R('''d = data[!apply( data,1,function(x)any(is.na(x))),]''')
        # save UTR
        R('''utrs = d$utr''')
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove those which are completely empty, logtransform or scale
        # data and export
        R('''lraw = log10( d[!apply(d,1,function(x)all(x==0)),] + 1 )''')

        utrs = R('''utrs = utrs[!apply(d,1,function(x)all(x==0))]''')
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''')
        exons = R('''lraw[,1]''')

        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # number of transitions between utrs
        transitions = numpy.zeros((3, 3), int)

        for row in range(len(utrs)):
            utr, exon = utrs[row], exons[row]

            # only consider genes with expression coverage
            # note: expression level is logscaled here, 10^1 = 10
            if exon < 0.1:
                continue

            # first row is column names, so row + 1
            values = list(scaled.rx(row + 1, True))

            utr_bins = utr // binsize
            nonutr_bins = (territory_size - utr) // binsize

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            outside_utr.extend([v for v in values[utr_bins:] if v <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([v for v in values[1:utr_bins] if v > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend([v for v in values[utr_bins:] if v > 0.5])

        # estimation for
        # 5% chance of transiting to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05

        # 10% chance of remaining in otherTranscript
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info("counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" %
               (len(within_utr), numpy.mean(within_utr),
                len(outside_utr), numpy.mean(outside_utr),
                len(otherTranscript), numpy.mean(otherTranscript)))

        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        # (values of exactly 0 or 1 are nudged by 0.001 before fitting)
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''')

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''')
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''')
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))''')

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info("beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" %
               (within_a, within_b, outside_a, outside_b, other_a, other_b))

        # diagnostic plot: histogram of each emission set with the fitted
        # beta density on top
        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=1000, width=1000)

        R('''par(mfrow=c(3,1))''')
        R('''x=seq(0,1,0.02)''')
        R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''')

        R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''')

        R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''')
        R['dev.off']()

        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'], fit_outside_utr$estimate['shape1'], fit_otherTranscript$estimate['shape1']), shape2=c(fit_within_utr$estimate['shape2'], fit_outside_utr$estimate['shape2'], fit_otherTranscript$estimate['shape2'])) ''')
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''')

        E.info("fitting starts")
        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')

        counter = E.Counter()

        for idx in range(len(utrs)):

            gene_id = genes[idx]
            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1))
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''')
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')
            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # the first bin in state 2 (notUTR) ends the UTR;
            # subtract 1 for last exon
            try:
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                # the chain never left the UTR state
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
def buildUTRExtension(infile, outfile):
    '''build new utrs by building and fitting an HMM
    to reads upstream and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems

    * the size of the extension is limited by the window size

    * introns within UTRs are ignored.

    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is
      a good amount of reads still present in the UTR.

    The model

    The model is a three-state model::

        UTR --|--> notUTR --|--> otherTranscript --|
          ^---|      ^------|              ^-------|
                     ^-----------------------------|

    The chain starts in UTR and ends in notUTR or otherTranscript.

    The otherTranscript state models peaks within the upstream/
    downstream region of a gene. These peaks might correspond to
    additional exons or unknown transcripts. Without this state,
    the UTR might be artificially extended to include these peaks.

    Emissions are modelled with beta distributions. These
    distributions permit both bimodal (UTR) and unimodal (notUTR)
    distribution of counts.

    Parameter estimation

    Parameters are derived from known UTRs within full length
    territories.

    Transitions and emissions for the otherTranscript state
    are set heuristically:

       * low probability for remaining in state "otherTranscript".
           * these transcripts should be short.

       * emissions biased towards high counts - only strong signals
           will be considered.

       * these could be estimated from known UTRs, but I am worried
           UTR extensions then will be diluted.

    Alternatives

    The method could be improved.

        * base level resolution?
            * longer chains result in more data and longer running times.
            * the averaging in windows smoothes the data, which might have
                a beneficial effect.

        * raw counts instead of scaled counts?
            * better model, as highly expressed genes should give more
                confident predictions.

    For each of the two input tables (upstream, downstream) a record
    ``(old_utr, new_utr, max_utr, status)`` is stored per gene, with
    status one of "ok", "max", "fail", "notexpressed" or
    "no observations".
    '''

    # the bin size , see gtf2table - can be cleaned from column names
    # or better set as options in .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'):
        contig, strand, start, end = (x['contig'], x['strand'],
                                      int(x['start']), int(x['end']))
        geneinfos[x['gene_id']] = (contig, strand, start, end)

    infiles = [infile + ".readextension_upstream_sense.tsv.gz",
               infile + ".readextension_downstream_sense.tsv.gz"]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}

    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)

        parts = os.path.basename(filename).split(".")

        R('''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' %
          {"filename": filename})

        ##########################################
        # estimation
        ##########################################
        # Rows are selected with logical masks rather than x[-which(cond)]:
        # when no row matches cond, which() is empty and x[-integer(0)]
        # would select nothing instead of everything.

        # take only those with a 'complete' territory
        R('''d = data[!apply( data,1,function(x)any(is.na(x))),]''')
        # save UTR
        R('''utrs = d$utr''')
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove those which are completely empty, logtransform or scale
        # data and export
        R('''lraw = log10( d[!apply(d,1,function(x)all(x==0)),] + 1 )''')

        utrs = R('''utrs = utrs[!apply(d,1,function(x)all(x==0))]''')
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''')
        exons = R('''lraw[,1]''')

        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # number of transitions between utrs
        transitions = numpy.zeros((3, 3), int)

        for row in range(len(utrs)):
            utr, exon = utrs[row], exons[row]

            # only consider genes with expression coverage
            # note: expression level is logscaled here, 10^1 = 10
            if exon < 0.1:
                continue

            # first row is column names, so row + 1
            values = list(scaled.rx(row + 1, True))

            utr_bins = utr // binsize
            nonutr_bins = (territory_size - utr) // binsize

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            outside_utr.extend([v for v in values[utr_bins:] if v <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([v for v in values[1:utr_bins] if v > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend([v for v in values[utr_bins:] if v > 0.5])

        # estimation for
        # 5% chance of transiting to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05

        # 10% chance of remaining in otherTranscript
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info("counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" %
               (len(within_utr), numpy.mean(within_utr),
                len(outside_utr), numpy.mean(outside_utr),
                len(otherTranscript), numpy.mean(otherTranscript)))

        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        # (values of exactly 0 or 1 are nudged by 0.001 before fitting)
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''')

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''')
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''')
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))''')

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info("beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" %
               (within_a, within_b, outside_a, outside_b, other_a, other_b))

        # diagnostic plot: histogram of each emission set with the fitted
        # beta density on top
        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=1000, width=1000)

        R('''par(mfrow=c(3,1))''')
        R('''x=seq(0,1,0.02)''')
        R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''')

        R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''')

        R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''')
        R['dev.off']()

        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'], fit_outside_utr$estimate['shape1'], fit_otherTranscript$estimate['shape1']), shape2=c(fit_within_utr$estimate['shape2'], fit_outside_utr$estimate['shape2'], fit_otherTranscript$estimate['shape2'])) ''')
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''')

        E.info("fitting starts")
        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')

        counter = E.Counter()

        for idx in range(len(utrs)):

            gene_id = genes[idx]
            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1))
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''')
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')
            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # the first bin in state 2 (notUTR) ends the UTR;
            # subtract 1 for last exon
            try:
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                # the chain never left the UTR state
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
def r_matrix(x, rows):
    """Return an R matrix with *rows* rows built from the values in *x*.

    *x* is first converted to an R float vector; the number of columns is
    left for R's ``matrix`` to work out.
    """
    float_vector = ro.FloatVector(x)
    return r.matrix(float_vector, nrow=rows)
import numpy as np
from rpy2.robjects import numpy2ri
from rpy2.robjects import r

# Enable the numpy <-> R conversion before any array is handed to R.
numpy2ri.activate()

# Build a 3x3 R matrix from the integers 0..8, bind it to the name "x" in
# R and let R print it.
values = np.array(range(9))
x = r.matrix(values, nrow=3, ncol=3)
r.assign('x', x)
r('print(x)')
def exprs(self, array):
    """Replace the expression matrix of ``self.ExpressionSet`` with *array*.

    array -- two-dimensional array; it is handed to R as a matrix with
             ``array.shape[0]`` rows and ``array.shape[1]`` columns.

    The update goes through R's replacement function ``exprs<-``.
    """
    exprs_set = r('`exprs<-`')
    mat = r.matrix(array, nrow=array.shape[0], ncol=array.shape[1])
    # An R replacement function returns the modified object rather than
    # changing its argument, so the result has to be stored back;
    # discarding it would leave self.ExpressionSet unchanged.
    self.ExpressionSet = exprs_set(self.ExpressionSet, mat)
def run(args):
    """
    Main function

    For every gene with index ``start_gene_index``..``end_gene_index``
    (1-based, inclusive) in the gene info table, combine the per-tissue
    z-scores with the GBJ test from the R package of the same name and write
    one tab-separated line per gene (gene, test_score, p_value) to
    ``<output_name>_<start>_<end>.txt`` in the output directory.

    :param args: args from the command line; the attributes read are
        ``start_gene_index``, ``end_gene_index``, ``input_folder`` (z-score
        ``.csv`` files), ``gene_info``, ``weight_db`` (``.db`` weight
        databases), ``cov_dir`` (``<gene>.snplist`` / ``<gene>.cov`` files),
        ``output_dir`` and ``output_name``.

    Side effects: changes the working directory (z-score folder, database
    folder, output folder, in that order) and prints progress to stdout.
    Genes whose SNP list cannot be read are skipped and stay ``NA`` in the
    output.
    """
    # argument reading
    # index of starting task
    nstart = int(args.start_gene_index)
    # index of ending task
    nend = int(args.end_gene_index)
    # single mask dir
    single_mask_dir = args.input_folder
    # info file
    info_file = args.gene_info
    # database dir
    db_dir = args.weight_db
    # covariance dir
    cov_dir = args.cov_dir
    # output dir
    out_dir = args.output_dir
    # read list of genes
    gene_info = pd.read_table(info_file)
    # output name
    output_name = args.output_name

    # r interface
    r_requirement()
    rpy2.robjects.numpy2ri.activate()
    importr("GBJ")

    P = nend - nstart + 1
    # all three are copies of the same "gene_ensg" column
    gene_ensg = gene_info["gene_ensg"].copy()
    gene_id = gene_info["gene_ensg"].copy()
    gene_name = gene_info["gene_ensg"].copy()

    # read z-score file
    logging.info("Read in z-score files")
    # directory of z-score
    os.chdir(single_mask_dir)
    # search for files ending with .csv
    fi = []
    fi_sqtl = []
    for file in sorted(os.listdir("./")):
        if file.endswith(".csv"):
            fi.append(file)
        if file.endswith("_sqtl.csv"):
            fi_sqtl.append(file)
    logging.info(str(len(fi)) + " files in total.")
    N = len(fi)
    # index of sqtl results (3 tissues)
    indi2 = match_list(fi_sqtl, fi)
    # index of eqtl results (47 tissues)
    indi1 = np.delete(np.arange(0, N), indi2)

    zscore_dict = {}
    for i in range(N):
        nam = "zscore_" + str(i + 1)
        zscore_dict[nam] = pd.read_csv(fi[i], header="infer")

    # output file: list of test score and p-value
    logging.info("compute p-value for genes")
    # directory of db
    os.chdir(db_dir)

    # initialize the outcome matrix: columns 0/1 hold the gene, 2/3 the
    # test score and p-value, and i + 5 the p-value of eqtl tissue i
    outcome = pd.DataFrame(np.zeros(shape=(P, N + 5)))
    outcome.iloc[:, :] = np.nan
    outcome.loc[:, 0] = gene_id[(nstart - 1):nend].values
    outcome.loc[:, 1] = gene_name[(nstart - 1):nend].values
    outcome = outcome.rename(columns={0: "gene_id", 1: "gene_name"})

    # read the database
    # NOTE(review): from here on `fi` lists the .db files but is indexed
    # with indi1/indi2, which were computed on the .csv list -- this
    # presumably relies on both folders sorting tissues identically; verify.
    fi = []
    for file in sorted(os.listdir(db_dir)):
        if file.endswith(".db"):
            fi.append(file)

    # calculation
    for k in range(P):
        logging.info("Gene: " + str(k + nstart))
        gene = gene_ensg[k + nstart - 1]
        print(gene)

        # read snp list
        # snp_rsid
        try:
            filename = cov_dir + "/" + gene + ".snplist"
            snp_rsid = pd.read_table(filename, header=None)
        except Exception as err:
            # best effort: a gene without a readable SNP list is skipped,
            # but say so instead of failing silently
            logging.warning("Skipping gene %s: cannot read SNP list (%s)",
                            gene, err)
            continue
        snp_rsid = list(snp_rsid.loc[:, 0])

        # matrix of weights
        # number of snps
        M = len(snp_rsid)
        logging.info("Number of SNPs: " + str(M))

        # NOTE(review): the SQL below is built by string concatenation from
        # gene identifiers in the info file / databases; switch to bound
        # parameters if those can ever be untrusted.

        # weights1: matrix of eqtl tissues
        weights1 = np.zeros(shape=(M, len(indi1)))
        for i in range(len(indi1)):
            dbname = fi[indi1[i]]
            conn = create_connection(dbname)
            try:
                cur = conn.cursor()
                sql_q = 'select * from weights where gene = "' + gene + '"'
                tmp_query = cur.execute(sql_q).fetchall()
            finally:
                # one connection per gene and tissue: release it right away
                conn.close()

            rsid_in_db = [str(row[0]) for row in tmp_query]
            index = match_list(rsid_in_db, snp_rsid)
            matched = index > -1
            indi = index[matched]
            # extract the weight
            tmp_weights = np.array([str(row[2]) for row in tmp_query])
            if matched.any():
                weights1[indi, i] = tmp_weights[matched]

        # weights2: matrix of sqtl tissues (each intron is regarded as a
        # separate tissue)
        weights2 = np.empty((M, 0))
        intron_name = {}
        for i in range(len(indi2)):
            dbname = fi[indi2[i]]
            conn = create_connection(dbname)
            try:
                cur = conn.cursor()
                sql_q = ("select * from weights where gene LIKE '" + gene +
                         "!_%'" + " ESCAPE '!'")
                tmp_query = cur.execute(sql_q).fetchall()
                tmp_intron_name = [str(row[1]) for row in tmp_query]
                intron_name[i] = np.unique(tmp_intron_name)
                L = len(intron_name[i])
                weights = np.zeros(shape=(M, L))
                for j in range(L):
                    sql_q = ('select * from weights where gene = "' +
                             intron_name[i][j] + '"')
                    tmp_query = cur.execute(sql_q).fetchall()
                    # extract the rsid for certain intron
                    rsid_in_db = [str(row[0]) for row in tmp_query]
                    index = match_list(rsid_in_db, snp_rsid)
                    matched = index > -1
                    indi = index[matched]
                    # extract the weight
                    tmp_weights = np.array(
                        [str(row[2]) for row in tmp_query])
                    if matched.any():
                        weights[indi, j] = tmp_weights[matched]
            finally:
                conn.close()
            weights2 = np.hstack((weights2, weights))

        weights_f = np.hstack((weights1, weights2))

        # covariance matrix of snps
        cov_file = cov_dir + "/" + gene_id[k + nstart - 1] + ".cov"
        # loadtxt returns a 0-d / 1-d array for a single value / single row;
        # promote to 2-d so the products below are always matrix products
        cov_matrix = np.atleast_2d(np.loadtxt(cov_file))

        # covariance matrix of gene in different tissue: W^T * C * W
        cov_gene = np.array(weights_f.T.dot(cov_matrix).dot(weights_f))

        # normalization
        # Row i is divided by sqrt(c_ii) first; afterwards the diagonal
        # entry itself equals sqrt(c_ii), so dividing column i by it scales
        # the column by the same factor and leaves 1 on the diagonal.
        ncol = cov_gene.shape[1]
        for i in range(ncol):
            if cov_gene[i, i] != 0:
                cov_gene[i, :] = cov_gene[i, :] / np.sqrt(cov_gene[i, i])
                cov_gene[:, i] = cov_gene[:, i] / cov_gene[i, i]

        # zscore_gene1: z-score of eqtl tissues
        zscore_gene1 = np.empty(len(indi1))
        for i in range(len(indi1)):
            nam = "zscore_" + str(indi1[i] + 1)
            index = zscore_dict[nam]["gene"] == gene
            if sum(index) > 0:
                zscore_gene1[i] = zscore_dict[nam]["zscore"][index].values[0]
                # p-value
                outcome.loc[k, (i + 5)] = float(
                    zscore_dict[nam]["pvalue"][index].values[0])
            else:
                zscore_gene1[i] = np.nan

        # zscore_gene2: z-score of sqtl tissues (each matched intron has a
        # z-score including NA)
        zscore_gene2 = np.array([])
        pvalue_gene2 = np.array([])
        for i in range(len(indi2)):
            intron = intron_name[i]
            nam = "zscore_" + str(indi2[i] + 1)
            for j in range(len(intron)):
                index = zscore_dict[nam]["gene"] == intron[j]
                if sum(index) > 0:
                    tmp_zscore_gene = zscore_dict[nam]["zscore"][
                        index].values[0]
                    tmp_pvalue_gene = zscore_dict[nam]["pvalue"][
                        index].values[0]
                else:
                    tmp_zscore_gene = np.nan
                    tmp_pvalue_gene = np.nan
                zscore_gene2 = np.append(zscore_gene2, tmp_zscore_gene)
                pvalue_gene2 = np.append(pvalue_gene2, tmp_pvalue_gene)

        # matrix of zscores for all eqtl and sqtl tissues (the same
        # dimension with cov_gene matrix)
        zscore_gene = np.concatenate((zscore_gene1, zscore_gene2))

        # only keep tissues with prediction model for gene
        index = ~np.isnan(zscore_gene)
        n_valid = int(index.sum())
        if n_valid > 1:
            zscore_gene = zscore_gene[index]
            cov_gene = cov_gene[index, :][:, index]
        elif n_valid == 1:
            # a single tissue: report its own z-score and p-value
            _tmp_index = np.argmax(index)
            _tmp_zscore = zscore_gene[_tmp_index]
            if _tmp_index < len(indi1):
                _tmp_pvalue = outcome.loc[k, _tmp_index + 5]
            else:
                _tmp_pvalue = pvalue_gene2[_tmp_index - len(indi1)]
            outcome.loc[k, 2] = _tmp_zscore
            outcome.loc[k, 3] = _tmp_pvalue
            continue
        else:
            # test cannot be done
            continue

        # check if the matrix is symmetric
        r_issymmetric = r['isSymmetric']
        r_cov_gene = r.matrix(cov_gene, nrow=cov_gene.shape[0])
        if r_issymmetric(r_cov_gene)[0]:
            # GBJ
            # convert the python object to r object
            r_zscore_gene = r.matrix(zscore_gene)
            # run the test
            GBJ_res = r["GBJ"](test_stats=r_zscore_gene, cor_mat=r_cov_gene)
            # output the test result to the result matrix
            outcome.loc[k, 2] = GBJ_res.rx2("GBJ")[0]
            print(GBJ_res.rx2("GBJ")[0])
            outcome.loc[k, 3] = GBJ_res.rx2("GBJ_pvalue")[0]
            print(GBJ_res.rx2("GBJ_pvalue")[0])

    # output the results
    os.chdir(out_dir)
    output_df = outcome.iloc[:, 1:4]
    filename = output_name + "_" + str(nstart) + "_" + str(nend) + ".txt"
    output_df.to_csv(filename,
                     na_rep='NA',
                     header=["gene", "test_score", "p_value"],
                     index=None,
                     sep='\t',
                     mode='w')
def run(args):
    """Run the cross-tissue GBJ test for a range of genes and save the results.

    Genes are taken from the ``gene_ensg`` column of the gene-info table,
    rows ``start_gene_index`` .. ``end_gene_index`` (1-based, inclusive).
    For every gene the function

    1. reads ``<cov_dir>/<gene>.snplist`` and ``<cov_dir>/<gene>.cov``,
    2. collects the gene's SNP weights from every ``*.db`` file found in
       ``weight_db`` (one weight column per database),
    3. combines weights and SNP covariance into a gene-level correlation
       matrix across databases,
    4. looks the gene up in every ``*.csv`` z-score table found in
       ``input_folder``,
    5. hands the available z-scores and the matching sub-matrix to the R
       function ``GBJ``.

    The i-th ``.csv`` file is paired with the i-th ``.db`` file, both in
    sorted filename order.

    The result table is written to ``outcome_<start>_<end>.txt`` in
    ``output_dir`` (tab separated, no header, no index).  Its columns are:
    gene id, gene name (both filled from ``gene_ensg``), GBJ statistic,
    GBJ p-value, followed by one p-value column per z-score table.  Cells
    that are never filled keep their initial value ``0``.

    Side effects: the process working directory is changed several times
    and ends up being ``output_dir``; the gene id and the GBJ results are
    printed to stdout.

    :param args: args from the command line; the attributes read are
        ``start_gene_index``, ``end_gene_index``, ``input_folder``,
        ``gene_info``, ``weight_db``, ``cov_dir`` and ``output_dir``
    """
    # argument reading
    nstart = int(args.start_gene_index)  # 1-based index of the first gene
    nend = int(args.end_gene_index)      # 1-based index of the last gene
    info_file = args.gene_info
    # Resolve the directories once, before the first os.chdir(): a relative
    # path would otherwise be interpreted against whatever directory the
    # function happened to change into last.
    single_mask_dir = os.path.abspath(args.input_folder)
    db_dir = os.path.abspath(args.weight_db)
    cov_dir = os.path.abspath(args.cov_dir)
    out_dir = os.path.abspath(args.output_dir)

    # read list of genes
    gene_info = pd.read_table(info_file)

    # r interface
    r_requirement()
    rpy2.robjects.numpy2ri.activate()
    importr("GBJ")

    P = nend - nstart + 1  # number of genes handled by this call
    # Plain array of the requested genes: positional, so it cannot be
    # re-aligned by pandas when it is stored in the outcome table below.
    batch_genes = gene_info["gene_ensg"].iloc[(nstart - 1):nend].values

    # read z-score files
    logging.info("Read in z-score files")
    os.chdir(single_mask_dir)
    zscore_files = [name for name in sorted(os.listdir(single_mask_dir))
                    if name.endswith(".csv")]
    logging.info(str(len(zscore_files)) + " files in total.")

    N = len(zscore_files)
    zscore_tables = [pd.read_csv(name, header="infer")
                     for name in zscore_files]

    # output file: list of test score and p-value
    logging.info("compute p-value for genes")
    os.chdir(db_dir)

    # initialize the outcome matrix
    outcome = pd.DataFrame(np.zeros(shape=(P, 48)))
    # Assign raw values, not a Series: `.loc` aligns a Series on its index
    # labels, which misplaces / blanks the gene columns when nstart > 1.
    outcome[0] = batch_genes
    outcome[1] = batch_genes
    outcome = outcome.rename(columns={0: "gene_id", 1: "gene_name"})

    # weight databases (relative names, resolved against db_dir via chdir)
    db_files = [name for name in sorted(os.listdir(db_dir))
                if name.endswith(".db")]

    # calculation
    for k in range(P):
        logging.info("Gene: " + str(k + nstart))
        gene = batch_genes[k]
        print(gene)

        # read snp list; genes without a readable list are skipped
        try:
            snp_table = pd.read_table(
                os.path.join(cov_dir, gene + ".snplist"), header=None)
        except Exception as err:
            logging.warning("Skipping %s: cannot read SNP list (%s)",
                            gene, err)
            continue
        snp_rsid = list(snp_table.loc[:, 0])

        # matrix of weights: one row per SNP, one column per database
        M = len(snp_rsid)
        logging.info("Number of SNPs: " + str(M))
        weights = np.zeros(shape=(M, N))

        for i in range(N):
            conn = create_connection(db_files[i])
            try:
                cur = conn.cursor()
                sql_q = 'select * from weights where gene = "' + gene + '"'
                tmp_query = cur.execute(sql_q).fetchall()
            finally:
                # one connection per gene and database: release it right away
                conn.close()

            rsid_in_db = [str(row[0]) for row in tmp_query]
            tmp_weights = np.array([str(row[2]) for row in tmp_query])

            # position of every database SNP in snp_rsid, -1 if absent
            index = match_list(rsid_in_db, snp_rsid)
            matched = index > -1
            # Test for "any match"; summing the raw positions is wrong
            # because of the -1 entries and a possible match at position 0.
            if matched.any():
                weights[index[matched], i] = tmp_weights[matched]

        # covariance matrix of snps
        cov_matrix = np.loadtxt(os.path.join(cov_dir, gene + ".cov"))

        # covariance matrix of gene in different tissue
        # (atleast_2d keeps the single-SNP case, where loadtxt returns a
        # scalar, working as it did with the matrix class)
        cov_gene = weights.T.dot(np.atleast_2d(cov_matrix)).dot(weights)

        # normalization to a correlation matrix; statement order matters:
        # after the row has been divided by sqrt(c_ii) the diagonal entry
        # equals sqrt(c_ii), which is what the column is then divided by
        for i in range(N):
            if cov_gene[i, i] != 0:
                cov_gene[i, :] = cov_gene[i, :] / np.sqrt(cov_gene[i, i])
                cov_gene[:, i] = cov_gene[:, i] / cov_gene[i, i]

        # z-score of gene in different tissue (NaN = gene not in table)
        zscore_gene = np.full(N, np.nan)
        for i, table in enumerate(zscore_tables):
            hit = table["gene"] == gene
            if hit.any():
                zscore_gene[i] = table["zscore"][hit].values[0]
                # p-value
                outcome.loc[k, (i + 4)] = float(
                    table["pvalue"][hit].values[0])

        # only keep tissues with prediction model for gene
        has_model = ~np.isnan(zscore_gene)
        if not has_model.any():
            # test cannot be done
            continue
        zscore_gene = zscore_gene[has_model]
        cov_gene = cov_gene[has_model, :][:, has_model]

        # GBJ is only run on a symmetric matrix
        if np.allclose(cov_gene, cov_gene.T):
            # convert the python object to r object
            r_zscore_gene = r.matrix(zscore_gene)
            r_cov_gene = r.matrix(cov_gene, nrow=cov_gene.shape[0])

            # run the test
            GBJ_res = r["GBJ"](test_stats=r_zscore_gene, cor_mat=r_cov_gene)

            # output the test result to the result matrix
            gbj_stat = GBJ_res.rx2("GBJ")[0]
            gbj_pvalue = GBJ_res.rx2("GBJ_pvalue")[0]
            outcome.loc[k, 2] = gbj_stat
            print(gbj_stat)
            outcome.loc[k, 3] = gbj_pvalue
            print(gbj_pvalue)

    # output the results
    os.chdir(out_dir)
    filename = "outcome_" + str(nstart) + "_" + str(nend) + ".txt"
    outcome.to_csv(filename, header=None, index=None, sep='\t', mode='w')
def main(argv=None):
    """Script main: compare two lists of values with an R two-sample test.

    Reads the values from ``--infile1`` and ``--infile2`` (optionally
    translating categories to numbers through ``--infile-map``), runs the
    test selected with ``--method`` (``ks`` = Kolmogorov-Smirnov, the
    default, or ``mwu`` = Mann-Whitney U), draws a 2x2 panel (boxplot,
    quantile-quantile plot, relative and absolute frequency histograms)
    and prints p-value, statistic, alternative and method of the test.
    With ``--hardcopy`` a 1024x768 PNG device is opened on that file before
    plotting.

    :param argv: command line; defaults to ``sys.argv``.
    :raises ValueError: if ``--method`` is not ``ks``/``mwu`` or if one of
        the two input files is missing from the command line.
    """
    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.start() below, so an explicit
    # *argv* presumably has no effect on option parsing -- confirm against
    # the signature of E.start before passing it on.

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    # Fail early and clearly: previously an unknown method was only noticed
    # after all plots had been drawn (unbound `result`), and a missing input
    # file surfaced as an opaque TypeError from open(None).
    if options.method not in ("ks", "mwu"):
        raise ValueError(
            "unknown method '%s': expected 'ks' or 'mwu'" % options.method)
    if not options.filename_input1 or not options.filename_input2:
        raise ValueError("please supply both --infile1 and --infile2")

    # Input files are read inside `with` blocks so the handles are closed
    # as soon as the readers return.
    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile:
            map_category2value = IOTools.ReadMap(
                infile, map_functions=(str, float))

    with open(options.filename_input1, "r") as infile:
        values1, errors1 = IOTools.ReadList(
            infile, map_category=map_category2value)
    with open(options.filename_input2, "r") as infile:
        values2, errors2 = IOTools.ReadList(
            infile, map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    else:  # "mwu" -- the only other value accepted by the check above
        result = R.wilcox_test(values1, values2, paired=False)

    # expose the data to the R snippets evaluated below
    R.assign("v1", values1)
    R.assign("v2", values2)

    # 2x2 panel, filled row by row
    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    print("## Results for %s" % result['method'])
    for key in ['p.value', 'statistic', 'alternative', 'method']:
        print(key, result[key])

    E.stop()
def siteSubsetGGSeqLogo(logodata, chars, plotfile, width, height,
        yname='', char_colors=AA_COLORS_FG, ylimits=None):
    """Creates one-row logo plot with subset of sites.

    Designed to show logo plot for a subset of sites. This
    is useful when you have data for many sites, but only
    want to look at a few of them.

    Any existing file at `plotfile` is removed before plotting.
    The passed `logodata` is not modified.

    Args:
        `logodata` (pandas DataFrame)
            Contains data to plot. Should have the columns
            `site`, `show`, and a column giving the
            height of each char in `chars`. Only sites
            where `show` is `True` are shown. Sites are
            shown in the order they occur in this dataframe,
            with spaces every time there is an interspersed
            site with `show` being `False`.
        `chars` (list)
            Letters for which we plot heights.
        `plotfile` (str)
            Name of created plot.
        `width` (float)
            Width of plot in inches.
        `height` (float)
            Height of plot in inches.
        `yname` (str)
            If set to a non-empty string, is the y-axis label
            and yticks are drawn.
        `char_colors` (dict)
            Values give color for every character in `chars`.
        `ylimits` (`None` or 2-tuple)
            If not `None`, should give the ylimits for the plot
            as `(ymin, ymax)`

    Raises:
        `AssertionError` if `char_colors` lacks a character of
        `chars`, if `logodata` lacks an expected column, or if no
        site has `show` set; `RuntimeError` if `plotfile` does not
        exist after plotting.

    Here is an example that creates a plot for a subset of
    sites for two characters:

    >>> logodata = pandas.read_csv(io.StringIO(
    ...     '''site show    A    C
    ...        A101 True  0.8  0.2
    ...        N102 True  0.7  0.3
    ...        K103 False 0.1  0.9
    ...        L104 True  0.8  0.2
    ...        S105 True  0.5  0.5
    ...        T106 False 0.2  0.8
    ...        G107 False 0.4  0.6
    ...        L108 True  0.7  0.3'''),
    ...     delim_whitespace=True, index_col=False)
    >>> plotfile = '_siteSubsetGGSeqLogo_test_plot.png'
    >>> siteSubsetGGSeqLogo(logodata,
    ...         chars=['A', 'C'],
    ...         plotfile=plotfile,
    ...         width=3.5, height=2
    ...         )
    >>> os.path.isfile(plotfile)
    True

    Here is the plot created by the code block above:

    .. image:: _static/_siteSubsetGGSeqLogo_test_plot.png
       :width: 55%
       :align: center

    """
    if os.path.isfile(plotfile):
        os.remove(plotfile)

    # accept any sequence of characters, not only a list
    chars = list(chars)

    assert set(chars) <= set(char_colors.keys()), \
            "`char_colors` not defined for all chars"

    expectcol = ['site', 'show'] + chars
    assert set(logodata.columns) >= set(expectcol), \
            "`logodata` needs these column: {0}".format(expectcol)

    assert logodata['show'].any(), "no sites to show"

    # Explicit copy: everything below edits a private frame, which avoids
    # chained-assignment warnings and leaves the caller's data untouched.
    logodata = logodata[expectcol].copy()

    # for each consecutive set of rows not to show, keep just one: a row
    # is kept if it is shown or if its `show` differs from the row before
    # (the first row always differs from the missing predecessor)
    show = logodata['show']
    keeprow = show | (show != show.shift(1))
    logodata = logodata[keeprow].reset_index(drop=True)

    # trim first and last row if they are not to be shown
    if not logodata['show'].iloc[0]:
        logodata = logodata.iloc[1:].reset_index(drop=True)
    if not logodata['show'].iloc[-1]:
        logodata = logodata.iloc[:-1].copy()

    # set site label to empty and data to zero for rows not to show
    hidden = ~logodata['show']
    logodata.loc[hidden, 'site'] = ''
    logodata.loc[hidden, chars] = 0
    # 1-based positions of the spacer columns
    vertlines = logodata.index[hidden.values].values + 1

    # generate matrix to plot: the row-major ravel of the (site x char)
    # table fills the column-major R matrix with one column per site
    sites = logodata['site']
    matrix = r.matrix(logodata.set_index('site')[chars].values.ravel(),
                      ncol=len(sites),
                      dimnames=[chars, sites]
                      )

    if ylimits is None:
        ylimits = rinterface.NULL
    else:
        ylimits = FloatVector(ylimits)

    # make the plot
    with warnings.catch_warnings():
        warnings.simplefilter(SHOW_WARNINGS)
        _RFUNCS.siteSubsetGGSeqLogo(
                mat=matrix,
                plotfile=plotfile,
                width=width,
                height=height,
                xlabels=list(map(str, sites)),
                vertlines=vertlines,
                yname=yname,
                chars=StrVector(chars),
                char_colors=StrVector([char_colors[x] for x in chars]),
                ylimits=ylimits
                )

    if not os.path.isfile(plotfile):
        raise RuntimeError("failed to create {0}".format(plotfile))