def readAndGroupTable(infile, options):
    """read table from infile and group."""
    fields, table = CSV.ReadTable(
        infile, with_header=options.has_headers, as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]] + \
        [fields[x] for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = lambda z: reduce(lambda x, y: x + y, z)
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += list(map(
                lambda x: "%s_%s" % (fields[c], x),
                Stats.DistributionalParameters().getHeaders()))

    # Convert values to floats (except for the group column).
    # Rows with unconvertible values in options.columns are dropped.
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]
        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.GroupTable(table,
                              group_column=0,
                              group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
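# A minimal sketch of the grouping step that CSV.GroupTable is assumed to
# perform above: rows sharing the value in ``group_column`` are collapsed
# by applying ``group_function`` to each remaining column. The names here
# are illustrative, not the project's actual implementation.
import itertools

def group_table_sketch(rows, group_column=0, group_function=max):
    rows = sorted(rows, key=lambda r: r[group_column])
    for key, block in itertools.groupby(rows, key=lambda r: r[group_column]):
        block = list(block)
        # apply the aggregator column-wise, skipping the key column
        yield [key] + [group_function([r[c] for r in block])
                       for c in range(len(block[0])) if c != group_column]

# example: group by the first column, taking the per-column maximum
# list(group_table_sketch([["a", 1, 2], ["a", 3, 0], ["b", 5, 5]]))
# -> [['a', 3, 2], ['b', 5, 5]]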
def printHeightsPerTree(values, section, options,
                        prefix_header, prefix_row):
    if not values:
        return

    outfile, is_new = TreeReconciliation.getFile(options, section)
    if is_new:
        outfile.write("%s%s\theights\n" %
                      (prefix_header,
                       "\t".join(Stats.DistributionalParameters().getHeaders())))

    s = Stats.DistributionalParameters(values)
    s.setFormat(options.format_branch_length)
    outfile.write("%s%s\t%s\n" %
                  (prefix_row, str(s),
                   ",".join(map(lambda x: options.format_branch_length % x,
                                values))))
def checkFDR(self, pi0_method):
    result = Stats.doFDR(self.mPvalues, fdr_level=0.05,
                         pi0_method=pi0_method)
    R("""require ('qvalue')""")
    qvalues = R.qvalue(ro.FloatVector(self.mPvalues),
                       fdr_level=0.05,
                       pi0_method=pi0_method)

    assert qvalues.names[1] == "pi0"
    assert qvalues.names[2] == "qvalues"
    assert qvalues.names[5] == "significant"
    assert qvalues.names[6] == "lambda"

    r_qvalues = qvalues[2]
    r_pi0 = qvalues[1][0]

    self.assertEqual(len(result.mQValues), len(qvalues[2]))
    self.assertEqual(len(result.mLambda), len(qvalues[6]))
    self.assertEqual(result.mPi0, r_pi0)

    for a, b in zip(result.mQValues, r_qvalues):
        self.assertAlmostEqual(a, b, 2, "unequal: %f != %f" % (a, b))

    for a, b in zip(result.mPassed, qvalues[5]):
        self.assertEqual(a, b,
                         "threshold-passed flag not equal: %s != %s" %
                         (a, b))
def checkFDR(self, **kwargs):
    old = Stats.doFDR(self.pvalues, **kwargs)
    # print old.mQValues[:10]
    # print old.mPi0
    new = Stats.doFDRPython(self.pvalues, **kwargs)
    # print new.mQValues[:10]
    # print new.mPi0
    # self.assertAlmostEqual( old.mPi0, new.mPi0, places=3)
    self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < self.max_error)

    for pvalue, a, b in zip(self.pvalues, old.mQValues, new.mQValues):
        self.assertTrue(
            getRelativeError(a, b) < self.max_error,
            "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" %
            (getRelativeError(a, b), self.max_error, pvalue, a, b))
def __call__(self, track, slice=None):

    result = odict()
    merged = None
    rocs = []

    for field in self.mFields:

        data = []
        for replicate in EXPERIMENTS.getTracks(track):
            statement = "SELECT contig, start, end,%(field)s FROM %(replicate)s_intervals" % locals()
            data.append(self.get(statement))

        idx = []
        for x in range(len(data)):
            i = IndexedGenome.IndexedGenome()
            for contig, start, end, peakval in data[x]:
                i.add(contig, start, end, peakval)
            idx.append(i)

        def _iter(all):
            all.sort()
            last_contig, first_start, last_end, last_value = all[0]
            for contig, start, end, value in all[1:]:
                if contig != last_contig or last_end < start:
                    yield (last_contig, first_start, last_end)
                    last_contig, first_start, last_end = contig, start, end
                else:
                    last_end = max(last_end, end)
            yield (last_contig, first_start, last_end)

        if not merged:
            all = [x for x in itertools.chain(*data)]
            merged = list(_iter(all))

        roc_data = []
        for contig, start, end in merged:
            intervals = []
            for i in idx:
                try:
                    intervals.append(list(i.get(contig, start, end)))
                except KeyError:
                    continue

            if len(intervals) == 0:
                continue

            is_repro = len([x for x in intervals if x != []]) == len(data)
            value = max([x[2] for x in itertools.chain(*intervals)])

            # fpr, tpr
            roc_data.append((value, is_repro))

        roc_data.sort()
        roc_data.reverse()

        roc = zip(*Stats.computeROC(roc_data))
        result[field] = odict((("FPR", roc[0]), (field, roc[1])))

    return result
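# A minimal sketch of the ROC computation that Stats.computeROC is assumed
# to perform above: given (score, is_positive) pairs sorted by decreasing
# score, walk down the list and emit cumulative (FPR, TPR) points.
# Illustrative only; assumes both classes are present.
def compute_roc_sketch(sorted_pairs):
    npositives = sum(1 for _, flag in sorted_pairs if flag)
    nnegatives = len(sorted_pairs) - npositives
    tp, fp, points = 0, 0, []
    for score, is_positive in sorted_pairs:
        if is_positive:
            tp += 1
        else:
            fp += 1
        points.append((float(fp) / nnegatives, float(tp) / npositives))
    return points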
def check(self, method):
    '''check for length equality and elementwise equality.'''
    a = R['p.adjust'](self.pvalues, method=method)
    b = Stats.adjustPValues(self.pvalues, method=method)
    self.assertEqual(len(a), len(b))
    for x, y in zip(a, b):
        self.assertAlmostEqual(x, y)
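# A numpy sketch (an assumption, not the project's implementation) of the
# Benjamini-Hochberg adjustment that R's p.adjust(method="BH") and,
# presumably, Stats.adjustPValues compute: q_(i) = min_{j>=i} p_(j) * m / j,
# clipped to [0, 1].
import numpy

def bh_adjust_sketch(pvalues):
    p = numpy.asarray(pvalues, dtype=float)
    m = len(p)
    order = numpy.argsort(p)
    ranked = p[order] * m / numpy.arange(1, m + 1)
    # enforce monotonicity from the largest p-value downwards
    qvalues = numpy.minimum.accumulate(ranked[::-1])[::-1].clip(0, 1)
    out = numpy.empty(m)
    out[order] = qvalues
    return out

# bh_adjust_sketch([0.01, 0.02, 0.03, 0.5]) -> array([0.04, 0.04, 0.04, 0.5])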
def computeFDR(all_results, qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr
    '''
    # flatten all_results
    results = []
    for key, data in all_results.iteritems():
        results.extend(data.mResults.values())

    observed_min_pvalues = [min(x.mProbabilityOverRepresentation,
                                x.mProbabilityUnderRepresentation)
                            for x in results]

    if qvalue_method == "storey":

        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn(
                "estimated proportion of true null hypotheses is "
                "less than 10%% (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif qvalue_method == "empirical":
        # would require a sampled background distribution
        raise NotImplementedError("empirical FDR needs some work")
def doOldFDR(options, args):
    """apply fdr to output of annotator."""

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    # apply filters and create diagnostic plots
    for filename, data in zip(args, annotators):
        ninput = len(data)
        pvalues = [x.mPValue for x in data]
        vlambda = numpy.arange(0, max(pvalues), 0.05)
        try:
            qvalues = Stats.doFDR(
                pvalues, vlambda=vlambda, fdr_level=options.fdr)
        except ValueError as msg:
            E.warn("%s: fdr could not be computed - no filtering: %s" %
                   (filename, msg))
            continue

        qvalues.plot(filename + "_diagnostics.png")

        data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
def testLRT(self):
    """test that the false positive rate is in the same order as
    mSignificance.

    Sample from a normal distribution and compare two models:

    1. mean estimated = complex model (1 df)
    2. mean given     = simple model (0 df)

    Likelihood = P(model | data)
    """
    simple_np = 0
    complex_np = 1

    npassed = 0

    for replicate in range(0, self.mNumReplicates):
        sample = scipy.stats.norm.rvs(
            size=self.mNumSamples, loc=0.0, scale=1.0)
        mean = scipy.mean(sample)

        complex_ll = numpy.sum(
            numpy.log(scipy.stats.norm.pdf(sample, loc=mean, scale=1.0)))
        simple_ll = numpy.sum(
            numpy.log(scipy.stats.norm.pdf(sample, loc=0.0, scale=1.0)))

        a = Stats.doLogLikelihoodTest(
            complex_ll, complex_np,
            simple_ll, simple_np,
            significance_threshold=self.mSignificance)

        if a.mPassed:
            npassed += 1

    r = float(npassed) / self.mNumReplicates

    self.assertAlmostEqual(self.mSignificance, r, places=self.nplaces)
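# A minimal sketch (an assumption, not the project's code) of the standard
# log-likelihood ratio test that Stats.doLogLikelihoodTest presumably wraps:
# D = 2 * (lnL_full - lnL_null) is asymptotically chi-squared with
# df = (parameters_full - parameters_null) degrees of freedom.
import scipy.stats

def lrt_sketch(full_ll, full_np, null_ll, null_np, threshold=0.05):
    d = 2.0 * (full_ll - null_ll)
    df = full_np - null_np
    pvalue = scipy.stats.chi2.sf(d, df)
    return pvalue, pvalue < threshold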
def process(self, contig, start, end, reads, qualities):
    self.mOutFile.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" %
                        (self.mOutputId,
                         contig, start, end,
                         end - start,
                         len(reads),
                         len(qualities),
                         str(Stats.DistributionalParameters(qualities))))
def checkFDR(self, **kwargs): old = Stats.doFDR(self.pvalues, **kwargs) # print old.mQValues[:10] # print old.mPi0 new = Stats.doFDRPython(self.pvalues, **kwargs) # print new.mQValues[:10] # print new.mPi0 # self.assertAlmostEqual( old.mPi0, new.mPi0, places=3) self.assertTrue(getRelativeError(old.mPi0, new.mPi0) < self.max_error) for pvalue, a, b in zip(self.pvalues, old.mQValues, new.mQValues): self.assertTrue(getRelativeError(a, b) < self.max_error, "qvalues: relative error %f > %f (pvalue=%f, %f, %f)" % (getRelativeError(a, b), self.max_error, pvalue, a, b))
def process(self, contig, start, end, reads, qualities):
    aligned = [x for x in reads if x > 0]
    self.mOutFile.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" %
                        (self.mOutputId,
                         contig, start, end,
                         end - start,
                         len(reads),
                         len(aligned),
                         str(Stats.DistributionalParameters(aligned))))
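# A sketch of the summary statistics that Stats.DistributionalParameters is
# assumed to collect for a list of values; the key names are illustrative.
import numpy

def distributional_parameters_sketch(values):
    v = numpy.asarray(values, dtype=float)
    return {"nval": len(v), "min": v.min(), "max": v.max(),
            "mean": v.mean(), "median": numpy.median(v),
            "stddev": v.std(), "sum": v.sum()}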
def writeResults(outfile, results):
    fields = ("wall", "user", "sys", "cuser", "csys", "nchunks")

    outfile.write("host\t%s\n" % "\t".join(
        ["%s_%s" % (x, y) for x, y in
         itertools.product(fields, Stats.Summary().getHeaders())]))

    for host in sorted(results.keys()):
        result = results[host]
        outfile.write("%s" % host)
        for f in fields:
            d = [y[f] for y in result]
            outfile.write("\t%s" % Stats.Summary(d))
        outfile.write("\n")
def printHeightsPerSpecies(values, section, options,
                           prefix_header, prefix_row):
    if not values:
        return

    # distributions of distance to node
    outfile, is_new = TreeReconciliation.getFile(options, section)
    if is_new:
        outfile.write("%sspecies\t%s\theights\n" %
                      (prefix_header,
                       "\t".join(Stats.DistributionalParameters().getHeaders())))

    for species in sorted(values.keys()):
        s = Stats.DistributionalParameters(values[species])
        s.setFormat(options.format_branch_length)
        outfile.write("%s%s\t%s\t%s\n" %
                      (prefix_row, species, str(s),
                       ",".join(map(lambda x: options.format_branch_length % x,
                                    values[species]))))
def testAgainstQValue(self):

    R.assign("pvalues", self.pvalues)
    qvalue = R('''qvalue( pvalues )''')
    r_qvalues = qvalue[2]
    r_pi0 = qvalue[1][0]

    new = Stats.doFDRPython(self.pvalues)
    self.assertTrue(getRelativeError(r_pi0, new.mPi0) < self.max_error)

    for a, b in zip(r_qvalues, new.mQValues):
        self.assertAlmostEqual(a, b, places=self.nplaces)
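# A sketch (an assumption) of Storey's pi0 estimate underlying the qvalue
# package being tested here: the fraction of p-values above a tuning
# parameter lambda, rescaled by the width of the interval (lambda, 1].
import numpy

def storey_pi0_sketch(pvalues, vlambda=0.5):
    p = numpy.asarray(pvalues, dtype=float)
    return min(1.0, (p > vlambda).mean() / (1.0 - vlambda))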
def __str__(self):
    single_exon_transcripts = 0
    exons_per_transcript = []
    intron_sizes = []
    transcript_lengths = []
    exon_sizes = []

    for x in self.counts_exons_per_transcript.values():
        x.sort()
        x = Intervals.combine(x)
        transcript_lengths.append(x[-1][1] - x[0][0])
        exons_per_transcript.append(len(x))

        for start, end in x:
            exon_sizes.append(end - start)

        if len(x) == 1:
            single_exon_transcripts += 1
            continue

        last_end = x[0][1]
        for start, end in x[1:]:
            intron_sizes.append(start - last_end)
            last_end = end

    return "\t".join(map(str, (
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        single_exon_transcripts,
        Stats.Summary(exons_per_transcript),
        Stats.Summary(exon_sizes),
        Stats.Summary(intron_sizes),
        Stats.Summary(transcript_lengths),
    )))
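# A sketch (an assumption) of the interval merging that Intervals.combine
# is presumed to perform above: collapse overlapping or touching
# (start, end) intervals into a disjoint, sorted list.
def combine_intervals_sketch(intervals):
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# combine_intervals_sketch([(0, 10), (5, 20), (30, 40)]) -> [(0, 20), (30, 40)]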
def process(self, contig, start, end, reads, qualities):

    entry = GTF.Entry()
    entry.start, entry.end = start, end
    entry.gene_id = self.mIdFormat % id
    entry.transcript_id = entry.gene_id
    entry.contig = contig
    entry.feature = "exon"
    entry.source = "maq"

    read_stats = Stats.Summary(reads)

    entry.score = "%5.2f" % read_stats['mean']

    self.mOutFile.write(str(entry) + "\n")
def decorator_max_score(values, start, end, contig):
    """compute maximum of values."""
    d = Stats.DistributionalParameters(values)
    return d['max'], str(d)
        lnl_simple = float(row['%s:lnL' % b])
        df_complex = map_model2params[a]
        df_simple = map_model2params[b]

        if options.loglevel >= 3:
            options.stdlog.write(
                "# testing %s: ll=%f,df=%i versus %s:lnl=%f,df=%i\n" %
                (a, lnl_complex, df_complex, b, lnl_simple, df_simple))

        if lnl_complex < lnl_simple:
            nerrors += 1
            options.stdout.write("\tna\tna")
            continue

        lrt = Stats.doLogLikelihoodTest(
            lnl_complex, df_complex, lnl_simple, df_simple)

        if lrt.mPassed:
            stats[(a, b)] += 1

        options.stdout.write(
            "\t%s\t%5.2e" %
            (Stats.getSignificance(lrt.mProbability), lrt.mProbability))

    options.stdout.write("\n")
    noutput += 1

options.stdout.write("npassed")
for a, b in tests:
    options.stdout.write("\t%i\t%5.2f" %
                         (stats[(a, b)], 100.0 * stats[(a, b)] / noutput))
options.stdout.write("\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: table2table.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("transpose", "normalize-by-max",
                               "normalize-by-value", "multiply-by-value",
                               "percentile", "remove-header",
                               "normalize-by-table", "upper-bound",
                               "lower-bound", "kullback-leibler",
                               "expand", "compress", "fdr", "grep"),
                      help="actions to perform on table.")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format.")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t", "--headers", dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table.")

    parser.add_option("--set-transpose-field", dest="set_transpose_field",
                      type="string",
                      help="set first field (row 1 and col 1) to this value "
                      "[%default].")

    parser.add_option("--transpose-format", dest="transpose_format",
                      type="choice",
                      choices=("default", "separated", ),
                      help="input format of un-transposed table")

    parser.add_option("--expand", dest="expand_table", action="store_true",
                      help="expand table - multi-value cells will be "
                      "expanded over several rows.")

    parser.add_option("--no-headers", dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use.")

    parser.add_option("--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V", "--invert-match", dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows.")

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group", dest="group_column", type="int",
                      help="group values by column. Supply an integer "
                      "column [default=%default]")

    parser.add_option("--group-function", dest="group_function",
                      type="choice",
                      choices=("min", "max", "sum", "mean", "stats",
                               "cat", "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns.")

    parser.add_option("--collapse-table", dest="collapse_table",
                      type="string",
                      help="collapse a table. Value determines the missing "
                      "variable [%default].")

    parser.add_option("--join-column-name", dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table", dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields", dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option("--separator", dest="separator", type="string",
                      help="separator for multi-valued fields "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("BH", "bonferroni", "holm", "hommel",
                               "hochberg", "BY"),
                      help="method to perform multiple testing correction "
                      "by controlling the fdr [default=%default].")

    parser.add_option("--fdr-add-column", dest="fdr_add_column",
                      type="string",
                      help="add new column instead of replacing existing "
                      "columns. The value of the option will be used as "
                      "prefix if there are multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option("--id-column", dest="id_column", type="string",
                      help="list of column(s) to use as the row id when "
                      "flattening the table. If None, then row number is "
                      "used. [default=%default].")

    parser.add_option("--variable-name", dest="variable_name",
                      type="string",
                      help="the column header for the 'variable' column "
                      "when flattening [default=%default].")

    parser.add_option("--value-name", dest="value_name", type="string",
                      help="the column header for the 'value' column "
                      "when flattening [default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format="%5.2f",
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:
        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying on a
        # particular column and adding custom column headings

        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = map(
                lambda x: int(x) - 1, options.id_column.split(","))
            id_header = "\t".join([fields[id_column]
                                   for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" %
            (id_header, options.variable_name, options.value_name))

        for x, row in enumerate(table):
            if options.id_column:
                row_id = "\t".join([row[int(x) - 1]
                                    for x in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write(
                    "%s\t%s\t%s\n" % (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = zip(*table)

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.ReadTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = map(lambda x: int(x) - 1,
                              options.columns.split(","))

        patterns = []

        if options.file:
            infile = open(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:
                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or \
                    (found and not options.invert_match):
                print line[:-1]

    else:

        # Apply remainder of transformations
        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" %
               (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x / value, table[c])

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x * value, table[c])

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = map(lambda x: x / m, table[c])

            elif method == "kullback-leibler":

                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2],
                             options.format % e1,
                             options.format % e2,
                             options.format % ((e1 + e2) / 2)))
                E.Stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = zip(tt, range(nrows))
                    t.sort()
                    for i, n in zip(map(lambda x: x[1], t), range(nrows)):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":

                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, \
                    "pvalues > 1 in table: max=%s" % str(max(pvalues))
                assert min(pvalues) >= 0, \
                    "pvalue < 0 in table: min=%s" % str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = map(
                    str, Stats.adjustPValues(pvalues,
                                             method=options.fdr_method))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for c in options.columns:
                            fields.append(options.fdr_add_column + fields[c])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.ReadTable(
                    open(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # set 0s to 1 in the other matrix
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert back
        for c in options.columns:
            for r in range(nrows):
                if isinstance(table[c][r], float):
                    table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")

    E.Stop()
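# A short, self-contained sketch of the symmetric Kullback-Leibler statistic
# computed by the "kullback-leibler" method above, for two discrete
# distributions p and q: KL(p||q) = sum_i p_i * log(p_i / q_i); the mean of
# both directions is reported. Assumes strictly positive entries.
import math

def symmetric_kl_sketch(p, q):
    kl1 = sum(pi * math.log(pi / qi) for pi, qi in zip(p, q))
    kl2 = sum(qi * math.log(qi / pi) for pi, qi in zip(p, q))
    return kl1, kl2, (kl1 + kl2) / 2.0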
def doFDR(options, args):

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    do_filter = options.fdr_qvalue is not None

    extra_headers = set()
    for data, fdr, synonyms, input_files in annotators:
        for key, value in input_files.iteritems():
            extra_headers.add(key)
    extra_headers = sorted(list(extra_headers))

    # note: id used to be file
    options.stdout.write("id\tover\tcategory\tpvalue\tfold\tobserved"
                         "\texpected\tci95low\tci95high\tstddev\tfdr"
                         "\tqvalue\t%s\n" % "\t".join(extra_headers))

    # apply filters and create diagnostic plots
    for filename, vv in zip(args, annotators):
        data, fdr, synonyms, input_files = vv

        ninput = len(data)

        E.info("processing %s with %i data points" % (filename, ninput))

        no_fdr = False

        if options.fdr_method in ("annotator", "annotator-estimate"):
            pvalues = fdr.keys()
            pvalues.sort()
            pvalues.reverse()
            for pvalue in pvalues:
                try:
                    d = fdr[pvalue]["Significant"]
                except KeyError:
                    continue

                if d.mObserved == 0:
                    E.info("no data after fdr")
                    break
                elif d.mAverage / d.mObserved < options.fdr_qvalue:
                    E.info("filtering with P-value of %f" % pvalue)
                    if do_filter:
                        data = [x for x in data if x.mPValue < pvalue]
                    for d in data:
                        if d.mPValue < pvalue:
                            d.mFDR = 1
                            d.mQValue = options.fdr_qvalue
                    break
                else:
                    E.warn("fdr could not be computed - compute more "
                           "samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

        if options.fdr_method == "estimate" or \
                (options.fdr_method == "annotator-estimate" and no_fdr):

            E.info("estimating FDR from observed P-Values")

            pvalues = [x.mPValue for x in data]
            vlambda = numpy.arange(0, max(pvalues), 0.05)
            try:
                qvalues = Stats.doFDR(
                    pvalues, vlambda=vlambda, fdr_level=options.fdr_qvalue)
            except ValueError as msg:
                E.warn("fdr could not be computed - no output: %s" % msg)
                no_fdr = True
            else:
                for d, p, q in zip(data, qvalues.mPassed, qvalues.mQValues):
                    if p:
                        d.mFDR = 1
                        d.mQValue = q

                if do_filter:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]

        if do_filter and no_fdr:
            data = []

        nremoved = ninput - len(data)

        E.info("%s: %i data points left, %i removed" %
               (filename, len(data), nremoved))

        extra_values = []
        for key in extra_headers:
            if key in input_files:
                extra_values.append(input_files[key])
            else:
                extra_values.append("")

        extra_values = "\t".join(map(str, extra_values))

        for d in data:
            if d.mFoldChange < 1:
                code = "-"
            else:
                code = "+"

            try:
                id = re.search(options.regex_id, filename).groups()[0]
            except AttributeError:
                id = filename

            options.stdout.write(
                "%s\t%s\t%s\t%e\t%6.4f\t%f\t%f\t%f\t%f\t%f\t%i\t%e\t%s\n" %
                (id, code, d.mAnnotation, d.mPValue, d.mFoldChange,
                 d.mObserved, d.mExpected, d.mCI95[0], d.mCI95[1],
                 d.mStdDev, d.mFDR, d.mQValue, extra_values))
def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology,
                         go2info, options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find if there are categories that
    are differently enriched in a pair of gene lists.

    The test applied here is Fisher's exact test on the 2x2 table of
    foreground counts in the two gene lists.

    The assumption is that the background set is the same in all gene
    lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category:
               get counts in foreground, total counts of foreground
               compute enrichment p-value
               output
               save P-value

       apply fdr - output significant differences.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple("PairResult",
                                        "goid set1 set2 counts1 total1 "
                                        "pvalue1 qvalue1 counts2 total2 "
                                        "pvalue2 qvalue2 pvalue qvalue "
                                        "description")

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write("set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested"
                  "\tsignificant\tinsignificant\n")

    results = []

    total = len(dicts) * (len(dicts) - 1) / 2

    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    for x, genelist1 in enumerate(sorted(dicts)):

        x_go_categories = set(genelist1.keys())
        for y, genelist2 in enumerate(sorted(dicts[:x])):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, total, 100.0 * iteration / total))

            y_go_categories = set(genelist2.keys())

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the observed
                # counts
                if xx.mSampleCountsCategory < min_observed_counts and \
                        yy.mSampleCountsCategory < min_observed_counts:
                    c.skipped += 1
                    continue

                observed = (xx.mSampleCountsCategory,
                            yy.mSampleCountsCategory)

                aa, bb, cc, dd = \
                    (xx.mSampleCountsCategory,
                     yy.mSampleCountsCategory,
                     xx.mSampleCountsTotal - xx.mSampleCountsCategory,
                     yy.mSampleCountsTotal - yy.mSampleCountsCategory)

                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                fisher, pvalue = scipy.stats.fisher_exact(
                    numpy.array(((aa, bb), (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                results.append(PairResult._make((
                    category,
                    labels[x],
                    labels[y],
                    xx.mSampleCountsCategory,
                    xx.mSampleCountsTotal,
                    xx.mPValue,
                    xx.mQValue,
                    yy.mSampleCountsCategory,
                    yy.mSampleCountsTotal,
                    yy.mPValue,
                    yy.mQValue,
                    pvalue,
                    1.0,
                    go2info[category].mDescription)))

            outfile.write("\t".join(map(str, (
                labels[x], labels[y],
                len(x_go_categories),
                len(y_go_categories),
                c.shared,
                c.skipped,
                c.tested,
                c.significant_pvalue,
                c.insignificant_pvalue))) + "\n")

    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                method = "bonf"
                fdr_data = Stats.FDRResult()
                l = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * l) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
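# A minimal, self-contained example of the Fisher's exact test used above:
# scipy.stats.fisher_exact takes a 2x2 contingency table and returns the
# odds ratio and the two-sided p-value. The counts here are made up.
import numpy
import scipy.stats

table = numpy.array(((12, 3),    # category counts in list 1 / list 2
                     (88, 97)))  # remaining counts in list 1 / list 2
oddsratio, pvalue = scipy.stats.fisher_exact(table)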
def Collect(infile,
            with_headers=False,
            annotator_format=False,
            use_annotator_fdr=False,
            delims="",
            ignore="",
            max_pvalue=1.0,
            max_qvalue=None):
    """read input table."""

    data = []

    lines = [x for x in infile.readlines() if x[0] != "#"]

    if len(lines) == 0:
        return data

    if with_headers:
        del lines[0]

    if annotator_format:

        lines = [line for line in lines if not line.startswith("Iteration")]
        annotator_fdr = {}
        annotator_level = None

        for line in lines:
            if len(line) == 1:
                continue          # skip trailing blank lines

            if line.startswith("--"):
                if line.startswith("-- False"):
                    annotator_level = float(
                        re.search("-- False Discovery summary for p-value (.+):",
                                  line).groups()[0])
                    annotator_fdr[annotator_level] = {}
                elif line.startswith("-- Category"):
                    pass
                else:
                    if re.search("insufficiently", line):
                        continue
                    dd = re.split("\s+", line[4:-1])
                    d = DataFDR()
                    d.mObserved, d.mAverage, d.mMedian, d.m95 = list(map(
                        float, dd[1:]))
                    annotator_fdr[annotator_level][dd[0]] = d
                continue
            else:
                if line[0] == "Z":
                    continue      # skip header
                if len(line[:-1].split('\t')) != 9:
                    # HACK: accounts for a bug in Annotator output
                    continue

                try:
                    (z, percentchange, pvalue, observed, expected, low95,
                     up95, stddev, description) = line[:-1].split('\t')[:9]
                except ValueError:
                    raise ValueError("# parsing error in line: %s" %
                                     line[:-1])

                d = DataPoint()
                d.mAnnotation = description
                d.mPValue = float(pvalue)
                d.mFoldChange = 1.0 + float(percentchange) / 100.0
                data.append(d)

    else:

        for line in lines:
            try:
                (code, goid, scount, stotal, spercent, bcount, btotal,
                 bpercent, ratio, pover, punder, goid, category,
                 description) = line[:-1].split("\t")[:14]
            except ValueError:
                raise ValueError("# parsing error in line: %s" % line[:-1])

            if code == "+":
                p = pover
            else:
                p = punder

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(p)
            d.mFoldChange = float(spercent) / float(bpercent)
            data.append(d)

    # apply filters
    for c in delims:
        for d in data:
            d.mAnnotation = d.mAnnotation.split(c)[0]
    for c in ignore:
        for d in data:
            d.mAnnotation = d.mAnnotation.replace(c, '')

    ninput = len(data)
    no_fdr = False

    # apply filters
    if ninput > 0:
        if max_qvalue is not None:
            if use_annotator_fdr:
                pvalues = list(annotator_fdr.keys())
                pvalues.sort()
                pvalues.reverse()
                for pvalue in pvalues:
                    try:
                        d = annotator_fdr[pvalue]["Significant"]
                    except KeyError:
                        continue

                    if d.mObserved == 0:
                        E.info("no data remaining after fdr filtering")
                        data = []
                        break
                    elif d.mAverage / d.mObserved < max_qvalue:
                        E.info("filtering with P-value of %f" % pvalue)
                        data = [x for x in data if x.mPValue < pvalue]
                        break
                    else:
                        E.warn("fdr could not be computed - compute more "
                               "samples (at P = %f, actual fdr=%f)" %
                               (pvalue, d.mAverage / d.mObserved))
                        no_fdr = True

            if no_fdr:
                if use_annotator_fdr:
                    E.info("estimating FDR from observed P-Values")

                    pvalues = [x.mPValue for x in data]
                    vlambda = numpy.arange(0, max(pvalues), 0.05)
                    try:
                        qvalues = Stats.doFDR(
                            pvalues, vlambda=vlambda, fdr_level=max_qvalue)
                    except ValueError as msg:
                        E.warn(
                            "fdr could not be computed - no filtering: %s" %
                            msg)
                        no_fdr = True
                    else:
                        data = [x[0] for x in
                                zip(data, qvalues.mPassed) if x[1]]
        elif max_pvalue is not None:
            data = [x for x in data if x.mPValue < max_pvalue]

    if no_fdr:
        data = []

    nremoved = ninput - len(data)

    return data, nremoved, no_fdr
def decorator_median_length(intervals, start, end, contig, fasta):
    """compute length distribution."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return d['median'], str(d)
    if options.mode == "pairs":
        reference_result = last_result
        reference_id = x - 1
    elif options.mode == "1xn":
        reference_result = first_result
        reference_id = 0

    if reference_result.mNumParameters >= result.mNumParameters:
        if options.loglevel >= 1:
            options.stdlog.write(
                "number of parameters of full model not increased "
                "(null=%i, full=%i).\n" %
                (reference_result.mNumParameters,
                 result.mNumParameters))
        continue

    lrt = Stats.doLogLikelihoodTest(
        result.mLogLikelihood,
        result.mNumParameters,
        reference_result.mLogLikelihood,
        reference_result.mNumParameters,
        options.significance_threshold)

    if lrt.mPassed:
        c = "passed"
    else:
        c = "failed"

    options.stdout.write("%s%i\t%i\t%s\t%f\t%i\t%f\t%i\t%5.2e\n" %
                         (prefix_row,
                          reference_id,
                          x,
                          c,
                          lrt.mFullLogLikelihood,
                          lrt.mFullNumParameters,
                          lrt.mNullLogLikelihood,
                          lrt.mNullNumParameters,
                          lrt.mProbability,
                          ))

    last_result = result
    x += 1
def computeFDRs(go_results,
                foreground,
                background,
                options,
                test_ontology,
                gene2go,
                go2info):

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" %
           options.qvalue_method)

    samples = None

    observed_min_pvalues = [min(x[1].mProbabilityOverRepresentation,
                                x[1].mProbabilityUnderRepresentation)
                            for x in pairs]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            fdr_data = Stats.doFDR(observed_min_pvalues)

        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            fdr_data = Stats.FDRResult()
            l = float(len(observed_min_pvalues))
            fdr_data.mQValues = [min(1.0, x * l)
                                 for x in observed_min_pvalues]

        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        #######################################################################
        # sampling
        # for each GO-category:
        #     get maximum and minimum counts in x samples -> calculate
        #     minimum/maximum significance
        #     get average and stdev counts in x samples -> calculate
        #     z-scores for test set
        samples, simulation_min_pvalues = getSamples(gene2go,
                                                     foreground,
                                                     background,
                                                     options,
                                                     test_ontology,
                                                     go2info)

        # compute P-values from sampling
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample

        for k, v in pairs:

            if k in samples:
                s = samples[k]
            else:
                raise KeyError("category %s not in samples" % k)

            # calculate values for z-score
            if s.mStddev > 0:
                zscore = abs(
                    float(v.mSampleCountsCategory) - s.mMean) / s.mStddev
            else:
                zscore = 0.0

            #############################################################
            # FDR:
            # For each p-value p at node n:
            #   a = average number of nodes in each simulation run with
            #       P-value < p. This can be obtained from the array of
            #       all p-values and all nodes, simply divided by the
            #       number of samples (aka expfpos = experimental false
            #       positive rate).
            #   b = number of nodes in observed data that have a P-value
            #       of less than p (aka pos = positives in observed data).
            #   fdr = a / b
            pvalue = v.mPValue

            # calculate values for FDR:
            # nfdr = number of entries with P-value better than node.
            a = 0
            while a < len(simulation_min_pvalues) and \
                    simulation_min_pvalues[a] < pvalue:
                a += 1
            a = float(a) / float(sample_size)

            b = 0
            while b < len(observed_min_pvalues) and \
                    observed_min_pvalues[b] < pvalue:
                b += 1

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
            else:
                fdr = 1.0

            fdrs[k] = (fdr, a, b)
    else:
        qvalues = R['p.adjust'](
            observed_min_pvalues, method=options.qvalue_method)
        fdr_data = Stats.FDRResult()
        fdr_data.mQValues = list(qvalues)

        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    return fdrs, samples, method
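# The two linear scans above count how many sorted p-values fall below a
# threshold; on sorted numpy arrays the same counts can be obtained in
# O(log n) with searchsorted. A sketch under that assumption, not the
# project's code:
import numpy

def empirical_fdr_sketch(pvalue, simulated_pvalues, observed_pvalues,
                         sample_size):
    # both input arrays must be sorted ascending
    a = numpy.searchsorted(simulated_pvalues, pvalue) / float(sample_size)
    b = numpy.searchsorted(observed_pvalues, pvalue)
    return min(1.0, a / b) if b > 0 else 1.0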
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all tracks, genesets
    and annotation sets.

    Arguments
    ---------
    infiles : string
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search("^(\S+)_vs_(\S+)\.(\S+)",
                                                  infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[11] == "punder", \
                        "format error, expected pover-punder, got %s-%s" % \
                        (data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))
    tempf2.close()

    P.load(tempf2.name, outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1", "--infile", dest="filename_input",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2", "--infile2", dest="filename_input2",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header", dest="header", type="string",
                      help="header of value column [default=%default].")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = open(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = open(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)

        # compute power of test
        power = R.power_t_test(n=len(values),
                               delta=abs(stat["mean"]),
                               sd=stat["stddev"],
                               sig_level=0.05)['power']

        # smallest detectable difference at a power of 95%
        diff_at_power95 = R.power_t_test(n=len(values),
                                         power=0.95,
                                         sd=stat["stddev"],
                                         sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    if power:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.Stop()
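# For reference, a scipy-only sketch of the one-sample t-test performed via
# rpy2 above (assuming R's defaults: two-sided alternative, mu = 0).
import scipy.stats

def t_test_sketch(values, popmean=0.0):
    t_statistic, pvalue = scipy.stats.ttest_1samp(values, popmean)
    return t_statistic, pvalue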
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and deleterious SNPs within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps,
               COUNT(DISTINCT case t.prediction
                     when 'possiblydamaging' then map.locus_id
                     when 'probablydamaging' then map.locus_id
                     else NULL end) AS ndeleterious,
               MAX(s.length)
        FROM %(table)s as t,
             %(tablename_map)s as map,
             annotations.protein_stats as s,
             annotations.transcript_info as i
        WHERE map.snp_id = t.snp_id AND
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
        ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
                   FROM annotations.transcript_info AS i,
                        annotations.protein_stats AS s
                   WHERE s.protein_id = i.protein_id
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue")) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X )
        # sf = 1 - cdf and cdf = P( x <= X ), thus
        # sf = 1 - P( x <= X ) = P( x > X )
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for len")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, \
            com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues):

        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length

        code = "".join([str(int(x < fdr))
                        for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % int(nsnps),
                              "%i" % int(ndel),
                              "%6.4f" % del_p,
                              "%6.4g" % del_pvalue,
                              "%6.4g" % del_qvalue,
                              "%6.4f" % len_p,
                              "%6.4g" % len_pvalue,
                              "%6.4g" % len_qvalue,
                              "%6.4f" % com_p,
                              "%6.4g" % com_pvalue,
                              "%6.4g" % com_qvalue)) + "\n")

    # add missing genes
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % 0,
                              "%i" % 0,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1)) + "\n")

    outf.close()
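# A minimal, self-contained illustration of the survival-function trick used
# above: for X ~ Binomial(n, p), P(X >= k) = binom.sf(k - 1, n, p), because
# sf(k) = 1 - cdf(k) = P(X > k). The numbers here are arbitrary.
import scipy.stats

n, p, k = 100, 0.1, 15
p_at_least_k = scipy.stats.binom.sf(k - 1, n, p)  # P(X >= 15)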
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="choice",
                      action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")
    parser.add_option("--selection-mode", dest="selection_mode",
                      type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")
    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")
    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames", type="string",
                      help="input pattern.")
    parser.add_option("--filter-probability", dest="filter_probability",
                      type="float",
                      help="threshold for probability above which to "
                      "include positive sites [default=%default].")
    parser.add_option("--filter-omega", dest="filter_omega", type="float",
                      help="threshold for omega above which to include "
                      "positive sites [default=%default].")
    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")
    parser.add_option("--analysis", dest="analysis", type="string",
                      help="restrict output to set of analysis [beb|neb].")
    parser.add_option("--significance-threshold",
                      dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")
    parser.add_option("--filter-mali", dest="filter_mali", type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")
    parser.add_option("--filename-mali", dest="filename_mali",
                      type="string",
                      help="filename with multiple alignment used for "
                      "calculating sites - used for filtering")
    parser.add_option("--filename-map-mali", dest="filename_map_mali",
                      type="string",
                      help="filename with multiple alignment to map "
                      "sites onto.")
    parser.add_option("--jalview-titles", dest="jalview_titles",
                      type="string",
                      help="comma separated list of jalview annotation "
                      "titles.")
    parser.add_option("--jalview-symbol", dest="jalview_symbol",
                      type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise ValueError("unknown analysis section: '%s', "
                             "possible values are 'beb' and/or 'neb'" % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError("unknown model: '%s', "
                             "possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    # filter and extract functions
    filter_f = lambda x: x.mProbability >= options.filter_probability and \
        x.mOmega >= options.filter_omega
    extract_f = lambda x: x.mResidue

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    headers = []
    for f in args:
        ninput += 1
        try:
            results.append(codeml.parseOutput(open(f, "r").readlines()))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    # map of nested model (key) to more general model
    map_nested_models = {'8': '7', '2': '1', '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        mali.readFromFile(open(options.filename_mali, "r"))
    else:
        mali = None

    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError("please supply the input multiple alignment, "
                             "if residues are to be mapped.")

        # translate the alignments
        def translate(s):
            sequence = s.mString
            seq = []
            for codon in [sequence[x:x + 3]
                          for x in range(0, len(sequence), 3)]:
                aa = Genomics.MapCodon2AA(codon)
                seq.append(aa)
            s.mString = "".join(seq)

        tmali = Mali.Mali()
        tmali.readFromFile(open(options.filename_mali, "r"))
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        tmap_mali.readFromFile(open(options.filename_map_mali, "r"))

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then
            # align to this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" %
                                 tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write(
                """# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold)

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")

            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            consistent_cols = [None for x in range(len(options.analysis))]
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                row_consistent = None

                if options.prefix:
                    options.stdout.write("%s" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    x = 0
                    for analysis in options.analysis:

                        if analysis == "neb":
                            s = set(map(extract_f,
                                        filter(filter_f,
                                               sites.mNEB.mPositiveSites)))
                        elif analysis == "beb":
                            s = set(map(extract_f,
                                        filter(filter_f,
                                               sites.mBEB.mPositiveSites)))

                        options.stdout.write("\t%i" % (len(s)))

                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = \
                                consistent_cols[x].intersection(s)

                        x += 1

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write(
                        "\t%5.2e\t%i\t%5.2f\t%s" %
                        (lrt.mProbability,
                         lrt.mDegreesFreedom,
                         lrt.mChiSquaredValue,
                         c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" %
                    (len(row_consistent), npassed, headers[nmethod]))

                nmethod += 1

            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                x = 0
                for analysis in options.analysis:

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                    x += 1

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write(
                "\t%i\t%i\n" % (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            l = 1
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                if len(sites) == 0:
                    continue

                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" %
                    (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            headers = ["site", "P"]
            if map_old2new:
                headers.append("mapped")
                headers.append("Pm")

            options.stdout.write("\t".join(headers) + "\n")

            sites = list(sites)
            sites.sort()
            nmapped, nunmapped
= 0, 0 for site in sites: values = [site, "%6.4f" % significance[site]] if map_old2new: r = map_old2new.mapRowToCol(site) if r == 0: values.append("na") values.append("") nunmapped += 1 if options.loglevel >= 2: options.stdlog.write("# unmapped residue: %i\n" % site) else: values.append(r) values.append(consensus[r - 1]) nmapped += 1 options.stdout.write("\t".join(map(str, (values))) + "\n") if options.loglevel >= 1: options.stdlog.write( "# sites: ninput=%i, noutput=%i, nskipped=%i\n" % (len(sites), nmapped, nunmapped)) E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped)) E.Stop()
def decorator_median_length(intervals, start, end, contig, fasta):
    """compute length distribution."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return d['median'], str(d)
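# The decorator_* helpers in this file (median length here, stddev/median
# score and percent coverage below) share one contract: reduce a list of
# values or intervals to a single summary statistic plus a printable
# distribution. Stats.DistributionalParameters is CGAT-internal; the sketch
# below is a rough numpy stand-in, assuming only that the class exposes
# summary statistics by key -- an illustration, not the actual implementation.
import numpy


def distribution_summary(values):
    """return a dict of simple summary statistics for *values*."""
    a = numpy.asarray(list(values), dtype=float)
    return {"median": float(numpy.median(a)),
            # sample standard deviation; the CGAT class may use the
            # population estimate instead
            "stddev": float(a.std(ddof=1)) if len(a) > 1 else 0.0,
            "sum": float(a.sum())}


def decorator_median_length_sketch(intervals):
    """median interval length, mirroring the (value, str) contract above."""
    d = distribution_summary(end - start for start, end in intervals)
    return d["median"], str(d)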
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version = "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"] ) parser.add_option("-i", "--iterations", dest="iterations", type="int", help="number of iterations for sampling [default=%default]." ) parser.add_option("-q", "--qvalue", dest="qvalue_threshold", type="float", help="qvalue threshold [default=%default]." ) parser.add_option("--without-combine", dest="combine", action = "store_false", help="combine overlapping motifs [default=%default]." ) parser.add_option("-f", "--fdr-control", dest="fdr_control", type="choice", choices = ("per-sequence", "all", "xall"), help="qvalue threshold [default=%default]." ) parser.add_option("-m", "--motif", dest="motif", type="choice", choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"), help="qvalue threshold [default=%default]." ) parser.add_option("-a", "--arrangements", dest="arrangements", type="string", help ="',' separated list of repeat arrangements [default=%default]") parser.add_option("-x", "--mask", dest="mask", type="choice", choices=("dust","repeatmasker"), help ="mask sequences before scanning [default=%default]") parser.add_option("--output-stats", dest="output_stats", action = "store_true", help="output stats [default=%default]." ) parser.add_option("--add-sequence", dest="add_sequence", action = "store_true", help="add sequence information [default=%default]." ) parser.set_defaults( iterations = 100, qvalue_threshold = 0.05, motif = "rxrvdr", fdr_control = "all", combine = True, arrangements = None, mask = None, output_stats = False, add_sequence = False, ) ## add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start( parser, argv = argv, add_output_options = True ) ## do sth ninput, nskipped, noutput = 0, 0, 0 if options.arrangements == None: options.arrangements = [ "DR%s" % x for x in range(0,15) ] + [ "ER%s" % x for x in range(0,15) ] else: options.arrangements = options.arrangements.split(",") options.stdout.write( "%s" % "\t".join(Nubiscan.NubiscanMatch._fields) ) if options.add_sequence: options.stdout.write( "\tsequence" ) options.stdout.write("\n") if options.motif == 'nr': sense_matrix = NR elif options.motif == "rxrvdr": sense_matrix = RXRVDR elif options.motif == "rxrvdr1": sense_matrix = RXRVDR1 elif options.motif == "rxrvdr2": sense_matrix = RXRVDR2 else: raise ValueError("unknown matrix %s" % options.motif) if options.fdr_control == "all": seqs = list(FastaIterator.iterate(options.stdin)) if options.mask: masked_seqs = maskSequences( [x.sequence for x in seqs], options.mask ) else: masked_seqs = [x.sequence for x in seqs] ninput = len(seqs) map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) ) matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix, samples = options.iterations ) results = matcher.run( masked_seqs, options.arrangements, qvalue_threshold = options.qvalue_threshold ) if options.combine: results = Nubiscan.combineMotifs( results ) for r in results: if r.alternatives: alternatives = ",".join( [x.arrangement for x in r.alternatives ] ) else: alternatives = "" options.stdout.write( "\t".join( ( map_id2title[r.id], "%i" % r.start, "%i" % r.end, r.strand, r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore, "%6.4e" % r.pvalue, "%6.4e" % r.qvalue, alternatives) ) ) if options.add_sequence: s = masked_seqs[int(r.id)][r.start:r.end] if r.strand == "-": s = Genomics.complement( s ) s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper() options.stdout.write( "\t%s" % s ) options.stdout.write("\n") noutput += 1 # output stats if options.output_stats: outfile = E.openOutputFile( "fdr" ) outfile.write("bin\thist\tnobserved\n" ) for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations): outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs)) outfile.close() elif options.fdr_control == "xall": matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix, samples = options.iterations ) # collect all results matches = [] for seq in FastaIterator.iterate(options.stdin): ninput += 1 mm = matcher.run( seq.sequence, options.arrangements, qvalue_threshold = None ) for m in mm: matches.append( m._replace( sequence = seq.title ) ) # estimate qvalues for all matches across all sequences pvalues = [ x.pvalue for x in matches ] fdr = Stats.doFDR( pvalues ) qvalues = fdr.mQValues results = [] for m, qvalue in zip(matches, qvalues): if qvalue > options.qvalue_threshold: continue results.append( m._replace( qvalue = qvalue ) ) if options.combine: results = Nubiscan.combineMotifs( results ) # output for r in results: options.stdout.write( "\t".join( ( r.id, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore, "%6.4e" % r.pvalue, "%6.4e" % r.qvalue ) ) + "\n" ) noutput += 1 elif options.fdr_control == "per-sequence": matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix, samples = options.iterations ) for seq in FastaIterator.iterate(options.stdin): ninput += 1 result = matcher.run( seq.sequence, options.arrangements, qvalue_threshold = options.qvalue_threshold ) if options.combine: result = Nubiscan.combineMotifs( result ) t = re.sub(" .*","", seq.title) 
for r in result: options.stdout.write( "\t".join( ( t, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore, "%f" % r.pvalue, "%f" % r.qvalue ) ) + "\n" ) noutput += 1 E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) ) ## write footer and output benchmark information. E.Stop()
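# The "xall" branch above pools p-values over all sequences and lets
# Stats.doFDR turn them into q-values before thresholding. doFDR uses a
# qvalue-style estimator, so the numbers will differ slightly from a plain
# Benjamini-Hochberg conversion, but the pool-then-threshold logic is the
# same; a minimal BH sketch for comparison:
import numpy


def bh_qvalues(pvalues):
    """Benjamini-Hochberg q-values for a list of p-values."""
    p = numpy.asarray(pvalues, dtype=float)
    n = len(p)
    order = numpy.argsort(p)
    q = numpy.empty(n)
    running_min = 1.0
    # walk from the largest p-value down, enforcing monotonicity
    for rank in range(n - 1, -1, -1):
        i = order[rank]
        running_min = min(running_min, p[i] * n / (rank + 1))
        q[i] = running_min
    return q

# keep only matches below the threshold, as in the code above:
# results = [m for m, q in zip(matches, bh_qvalues(pvalues)) if q <= 0.05]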
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', "
                      "'all-but-first' or a ','-separated list of columns.")

    parser.add_option("--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%default].")

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")

    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")

    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action="append")

    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                               "bar-besides", "1_vs_x", "matched", "boxplot",
                               "scatter+marginal", "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method "
                      "[default=%default].")

    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")

    parser.add_option("-l", "--plot-labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots "
                      "[default=%default].")

    parser.add_option("-d", "--add-diagonal", dest="add_diagonal",
                      action="store_true",
                      help="add diagonal to plot [default=%default].")

    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")

    parser.add_option("--title", dest="title", type="string",
                      help="plot title [default=%default].")

    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")

    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot [default=%default].")

    parser.add_option("--allow-empty-file", dest="fail_on_empty",
                      action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option("--fail-on-empty", dest="fail_on_empty",
                      action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        lines = \
IOTools.openFile(options.input_filename, "r").readlines() else: # note: this will not work for interactive viewing, but # creating hardcopy plots works. lines = sys.stdin.readlines() lines = [x for x in lines if x[0] != "#"] if len(lines) == 0: if options.fail_on_empty: raise IOError("no input") E.warn("empty input") E.Stop() return matrix, headers, colours, legend = readTable(lines, "matrix", take_columns=options.columns, headers=True, colours=options.colours, row_names=options.legend) if options.input_filename2: # read another matrix (should be of the same format. matrix2, headers2, colours2, legend2 = readTable( lines, "matrix2", take_columns=options.columns, headers=True, colours=options.colours, row_names=options.legend) R.assign("headers", headers) ndata = R("""length( matrix[,1] )""")[0] if options.loglevel >= 1: options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata)) if colours: R.assign("colours", colours) for method in options.statistics: if method == "correlation": cor = R.cor(matrix, use="pairwise.complete.obs") writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f") elif method == "pearson": options.stdout.write("\t".join(("var1", "var2", "coeff", "passed", "pvalue", "n", "method", "alternative")) + "\n") for x in range(len(headers) - 1): for y in range(x + 1, len(headers)): try: result = R( """cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1)) except rpy.RPyException as msg: E.warn("correlation not computed for columns %i(%s) and %i(%s): %s" % ( x, headers[x], y, headers[y], msg)) options.stdout.write("%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" % (headers[x], headers[y], "na", "na", "na", 0, "na", "na")) else: options.stdout.write( "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" % (headers[x], headers[y], result.rx2('estimate').rx2( 'cor')[0], Stats.getSignificance( float(result.rx2('p.value')[0])), result.rx2('p.value')[0], result.rx2('parameter').rx2( 'df')[0], result.rx2('method')[0], result.rx2('alternative')[0])) elif method == "spearman": options.stdout.write("\t".join(("var1", "var2", "coeff", "passed", "pvalue", "method", "alternative")) + "\n") for x in range(len(headers) - 1): for y in range(x + 1, len(headers)): result = R( """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" % (x + 1, y + 1)) options.stdout.write( "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" % (headers[x], headers[y], result['estimate']['rho'], Stats.getSignificance(float(result['p.value'])), result['p.value'], result['parameter']['df'], result['method'], result['alternative'])) elif method == "count": # number of shared elements > threshold m, r, c = MatlabTools.ReadMatrix(open(options.input_filename, "r"), take=options.columns, headers=True) mask = numpy.greater(m, options.threshold) counts = numpy.dot(numpy.transpose(mask), mask) writeMatrix(options.stdout, counts, headers=c, format="%i") if options.plot: # remove columns that are completely empty if "pairs" in options.plot: colsums = R('''colSums( is.na(matrix ))''') take = [x for x in range(len(colsums)) if colsums[x] != ndata] if take: E.warn("removing empty columns %s before plotting" % str(take)) matrix = R.subset(matrix, select=[x + 1 for x in take]) R.assign("""matrix""", matrix) headers = [headers[x] for x in take] if legend: legend = [headers[x] for x in take] if options.r_options: extra_options = ", %s" % options.r_options else: extra_options = "" if options.legend is not None and len(legend): extra_options += ", legend=c('%s')" % "','".join(legend) if options.labels: xlabel, ylabel = options.labels.split(",") extra_options 
+= ", xlab='%s', ylab='%s'" % (xlabel, ylabel) else: xlabel, ylabel = "", "" if options.colours: extra_options += ", col=colours" if options.logscale: extra_options += ", log='%s'" % options.logscale if options.xrange: extra_options += ", xlim=c(%f,%f)" % tuple( map(float, options.xrange.split(","))) if options.yrange: extra_options += ", ylim=c(%f,%f)" % tuple( map(float, options.yrange.split(","))) if options.hardcopy: if options.hardcopy.endswith(".eps"): R.postscript(options.hardcopy) elif options.hardcopy.endswith(".png"): R.png(options.hardcopy, width=1024, height=768, type="cairo") elif options.hardcopy.endswith(".jpg"): R.jpg(options.hardcopy, width=1024, height=768, type="cairo") for method in options.plot: if ndata < 100: point_size = "1" pch = "o" elif ndata < 1000: point_size = "1" pch = "o" else: point_size = "0.5" pch = "." if method == "scatter": R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % ( point_size, extra_options)) if method == "scatter-regression": R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % ( point_size, extra_options)) dat = R( """dat <- data.frame(x = matrix[,1], y = matrix[,2])""") R( """new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""") mod = R("""mod <- lm( y ~ x, dat)""") R("""predict(mod, new, se.fit = TRUE)""") R("""pred.w.plim <- predict(mod, new, interval="prediction")""") R("""pred.w.clim <- predict(mod, new, interval="confidence")""") R( """matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""") R.mtext( "y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"], mod["coefficients"][ "(Intercept)"], R("""cor( dat )[2]"""), ndata), 3, cex=1.0) elif method == "pairs": if options.add_diagonal: R( """panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }""") else: R( """panel.hist <- function( x,y,... ) { points(x,y,...); }""") # There used to be a argument na_action="na.omit", but # removed this as there appeared error messages saying # "na.action is not a graphical parameter" and the # plots showed occasionally the wrong scale. 
# cex=point_size also caused trouble (error message: # "X11 used font size 8 when 2 was requested" or # similar) if options.colours: R.pairs(matrix, pch=pch, col=colours, main=options.title, panel="panel.hist", labels=headers, cex_labels=2.0) else: R.pairs(matrix, pch=pch, panel="panel.hist", main=options.title, labels=headers, cex_labels=2.0) elif method == "boxplot": extra_options += ",main='%s'" % options.title # set vertical orientation if max([len(x) for x in headers]) > 40 / len(headers): # remove xlabel: extra_options = re.sub(", xlab='[^']+'", "", extra_options) extra_options += ", names.arg=headers, las=2" R( """op <- par(mar=c(11,4,4,2))""") # the 10 allows the names.arg below the barplot R("""boxplot( matrix %s)""" % extra_options) elif method == "bar" or method == "bar-stacked": if not options.colours: extra_options += ", col=rainbow(5)" # set vertical orientation if max([len(x) for x in headers]) > 40 / len(headers): # remove xlabel: extra_options = re.sub(", xlab='[^']+'", "", extra_options) extra_options += ", names.arg=headers, las=2" R( """op <- par(mar=c(11,4,4,2))""") # the 10 allows the names.arg below the barplot R("""barplot(as.matrix(matrix), %s)""" % extra_options) elif method == "bar-besides": if not options.colours: extra_options += ", col=rainbow(%i)" % ndata # set vertical orientation if max([len(x) for x in headers]) > 40 / len(headers): # remove xlabel: extra_options = re.sub(", xlab='[^']+'", "", extra_options) extra_options += ", names.arg=headers, las=2" R( """op <- par(mar=c(11,4,4,2))""") # the 10 allows the names.arg below the barplot R("""barplot(as.matrix(matrix), beside=TRUE %s)""" % extra_options) elif method == "scatter+marginal": if options.title: # set the size of the outer margins - the title needs to be added at the end # after plots have been created R.par(oma=R.c(0, 0, 4, 0)) R("""matrix""") R(""" x <- matrix[,1]; y <- matrix[,2]; xhist <- hist(x, breaks=20, plot=FALSE); yhist <- hist(y, breaks=20, plot=FALSE); top <- max(c(xhist$counts, yhist$counts)); nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE ); par(mar=c(3,3,1,1)) ; plot(x, y, cex=%s, pch="o" %s) ; par(mar=c(0,3,1,1)) ; barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ; par(mar=c(3,0,1,1)) ; title(main='%s'); barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ; title(main='%s'); """ % (point_size, extra_options, xlabel, ylabel)) if options.title: R.mtext(options.title, 3, outer=True, line=1, cex=1.5) elif method in ("panel", "1_vs_x", "matched"): if method == "panel": pairs = [] for x in range(len(headers) - 1): for y in range(x + 1, len(headers)): pairs.append((x, y)) elif method == "1_vs_x": pairs = [] for x in range(1, len(headers)): pairs.append((0, x)) # print matching columns elif method == "matched": pairs = [] for x in range(len(headers) - 1): for y in range(x + 1, len(headers)): if headers[x] == headers[y]: pairs.append((x, y)) break w = int(math.ceil(math.sqrt(len(pairs)))) h = int(math.ceil(float(len(pairs)) / w)) PosInf = 1e300000 NegInf = -1e300000 xlabel, ylabel = options.labels.split(",") R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" % (w * h, w, h)) for a, b in pairs: new_matrix = [x for x in zip( list(matrix[a].values())[0], list(matrix[b].values())[0]) if x[0] not in (float("nan"), PosInf, NegInf) and x[1] not in (float("nan"), PosInf, NegInf)] try: R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % ( a + 1, b + 1, headers[b], headers[a], xlabel, 
ylabel)) except rpy.RException as msg: print("could not plot %s versus %s: %s" % (headers[b], headers[a], msg)) if options.hardcopy: R['dev.off']() E.info("matrix added as >matrix< in R.") if not options.hardcopy: if options.input_filename: interpreter = code.InteractiveConsole(globals()) interpreter.interact() else: E.info( "can not start new interactive session as input has come from stdin.") E.Stop()
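# The "pearson" and "spearman" branches above shell out to R's cor.test
# through rpy2, one column pair at a time. For cross-checking the output
# table, scipy.stats provides the same tests in pure Python; this sketch
# assumes the table is available as a 2-D numpy array with one column per
# header.
from itertools import combinations

import numpy
from scipy import stats


def correlation_table(matrix, headers, method="pearson"):
    """yield (var1, var2, coefficient, pvalue, n) for all column pairs,
    using pairwise-complete observations as in the R code above."""
    test = stats.pearsonr if method == "pearson" else stats.spearmanr
    for x, y in combinations(range(len(headers)), 2):
        a, b = matrix[:, x], matrix[:, y]
        mask = ~(numpy.isnan(a) | numpy.isnan(b))
        coeff, pvalue = test(a[mask], b[mask])
        yield headers[x], headers[y], coeff, pvalue, int(mask.sum())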
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id: matrix2stats.py 2795 2009-09-16 15:29:23Z andreas $", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", choices=("chi-squared", "pearson-chi-squared"), help="statistical methods to apply.") parser.add_option("-t", "--header-names", dest="headers", action="store_true", help="matrix has row/column headers.") parser.add_option("--no-headers", dest="headers", action="store_false", help="matrix has no row/column headers.") parser.add_option("-i", "--input-format", dest="input_format", type="choice", choices=("full", "sparse", "phylip"), help="""input format for matrix.""" ) parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("full", "sparse", "phylip"), help="""output format for matrix.""" ) parser.add_option("-p", "--parameters", dest="parameters", action="append", type="string", help="parameters for various functions.") parser.add_option("-a", "--iteration", dest="iteration", type="choice", choices=("pairwise", "all-vs-all"), help="""how to compute stats [%default].""" ) parser.set_defaults( method="chi-squared", headers=True, value_format="%6.4f", pvalue_format="%6.4e", input_format="full", write_separators=True, parameters=[], iteration=None, ) (options, args) = E.Start(parser) lines = [x for x in sys.stdin.readlines() if x[0] != "#"] chunks = [x for x in range(len(lines)) if lines[x][0] == ">"] if not chunks: options.write_separators = False chunks = [-1] chunks.append(len(lines)) ninput, noutput, nskipped = 0, 0, 0 if options.write_separators: options.stdout.write("test\t") header_prefix = "" if options.method == "chi-squared": header_prefix = "observed\texpected" options.stdout.write("\t".join( (header_prefix, "n", "min", "max", "chi", "df", "P", "passed", "phi")) + "\n") elif options.method in ("pearson-chi-squared",): options.stdout.write("column\t") options.stdout.write("\t".join( (header_prefix, "n", "prob", "obs", "exp", "chi", "df", "P", "passed", "phi")) + "\n") if len(options.parameters) == 0: raise "out of parameters - please supply probability or filename with probabilities." 
param = options.parameters[0] del options.parameters[0] if options.write_separators: probabilities = IOTools.ReadMap( IOTools.openFile(param, "r"), map_functions=(str, float)) else: probability = float(param) for x in range(len(chunks) - 1): ninput += 1 matrix, row_headers, col_headers = MatlabTools.readMatrix( StringIO("".join(lines[chunks[x] + 1:chunks[x + 1]])), format=options.input_format, headers=options.headers) nrows, ncols = matrix.shape if options.loglevel >= 2: options.stdlog.write("# read matrix: %i x %i, %i row titles, %i colum titles.\n" % (nrows, ncols, len(row_headers), len(col_headers))) if options.write_separators: options.stdout.write(lines[chunks[x]][1:-1] + "\t") pairs = [] if options.iteration == "pairwise": pairs = [] for row1 in range(0, len(row_headers)): for row2 in range(row1 + 1, len(row_headers)): pairs.append((row1, row2)) elif options.iteration == "all-vs-all": pairs = [] for row1 in range(0, len(row_headers)): for row2 in range(0, len(row_headers)): if row1 == row2: continue pairs.append((row1, row2)) if options.method == "chi-squared": for row1, row2 in pairs: row_header1 = row_headers[row1] row_header2 = row_headers[row2] try: result = Stats.doChiSquaredTest( numpy.vstack((matrix[row1], matrix[row2]))) except ValueError: nskipped += 1 continue noutput += 1 options.stdout.write("\t".join(( "%s" % row_header1, "%s" % row_header2, "%i" % result.mSampleSize, "%i" % min(matrix.flat), "%i" % max(matrix.flat), options.value_format % result.mChiSquaredValue, "%i" % result.mDegreesFreedom, options.pvalue_format % result.mProbability, "%s" % result.mSignificance, options.value_format % result.mPhi)) + "\n") elif options.method == "pearson-chi-squared": if nrows != 2: raise ValueError("only implemented for 2xn table") if options.write_separators: id = re.match("(\S+)", lines[chunks[x]][1:-1]).groups()[0] probability = probabilities[id] for col in range(ncols): options.stdout.write("%s\t" % col_headers[col]) result = Stats.doPearsonChiSquaredTest( probability, sum(matrix[:, col]), matrix[0, col]) options.stdout.write("\t".join(( "%i" % result.mSampleSize, "%f" % probability, "%i" % result.mObserved, "%f" % result.mExpected, options.value_format % result.mChiSquaredValue, "%i" % result.mDegreesFreedom, options.pvalue_format % result.mProbability, "%s" % result.mSignificance, options.value_format % result.mPhi))) if col < ncols - 1: options.stdout.write("\n") if options.write_separators: options.stdout.write(lines[chunks[x]][1:-1] + "\t") options.stdout.write("\n") E.info("# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped)) E.Stop()
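# Stats.doChiSquaredTest above receives a 2xN table built with numpy.vstack.
# Assuming it performs a standard chi-squared test of independence,
# scipy.stats.chi2_contingency reproduces chi, df and P; the phi coefficient
# below is the usual sqrt(chi2/n) -- whether that matches mPhi exactly is an
# assumption.
import numpy
from scipy.stats import chi2_contingency

observed = numpy.vstack((numpy.array([10, 20, 30]),
                         numpy.array([12, 18, 35])))
chi, pvalue, df, expected = chi2_contingency(observed)
phi = numpy.sqrt(chi / observed.sum())
print("chi=%6.4f\tdf=%i\tP=%6.4e\tphi=%6.4f" % (chi, df, pvalue, phi))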
def decorator_stddev_score(values, start, end, contig):
    """compute stddev of values."""
    d = Stats.DistributionalParameters(values)
    return d['stddev'], str(d)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--guess-format", dest="guess_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'), help="The default behaviour of the script is to guess the quality " "format of the input fastq file. The user can specify the " "quality format of the input file using the --guess-format option. " "The script will use this format if the " "sequence qualities are ambiguous.[default=%default].") parser.add_option( "--target-format", dest="target_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'), help="The script will convert quality scores to the destination " "format unless [default=%default].") parser.set_defaults( target_format=None, guess_format=None, min_quality=10, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) c = E.Counter() if options.target_format: iterator = Fastq.iterate_convert(options.stdin, format=options.target_format, guess=options.guess_format) else: iterator = Fastq.iterate_guess(options.stdin, guess=options.guess_format) options.stdout.write("read\tnfailed\tnN\t%s\n" % ("\t".join(Stats.Summary().getHeaders()))) min_quality = options.min_quality for record in iterator: c.input += 1 quals = record.toPhred() nfailed = len([x for x in quals if x < min_quality]) nns = record.seq.count("N") + record.seq.count(".") options.stdout.write( "%s\t%i\t%i\t%s\n" % (record.identifier, nfailed, nns, str(Stats.Summary(quals)))) c.output += 1 # write footer and output benchmark information. E.info("%s" % str(c)) E.Stop()
def decorator_percent_coverage(intervals, start, end, contig, fasta):
    """compute percent of the range [start, end) covered by intervals."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return 100.0 * float(d['sum']) / (end - start), str(d)
def loadGOs(infiles, outfile, tablename): '''import GO results into a single table. This method also computes a global QValue over all tracks, genesets and annotation sets. Arguments --------- infiles : string Output files of several runGO analyses outfile : string Output filename, contains log information tablename : string Table name for storing results. ''' header = False tempf1 = P.getTempFile() pvalues = [] for infile in infiles: indir = infile + ".dir" if not os.path.exists(indir): continue track, geneset, annotationset = re.search( "^(\S+)_vs_(\S+)\.(\S+)", infile).groups() for filename in glob.glob(os.path.join(indir, "*.overall")): for line in open(filename, "r"): if line.startswith("#"): continue data = line[:-1].split("\t") if line.startswith("code"): if header: continue tempf1.write("track\tgeneset\tannotationset\t%s" % line) header = True assert data[10] == "pover" and data[ 11] == "punder", "format error, expected pover-punder, got %s-%s" % (data[10], data[11]) continue tempf1.write("%s\t%s\t%s\t%s" % (track, geneset, annotationset, line)) pvalues.append(min(float(data[10]), float(data[11]))) tempf1.close() E.info("analysing %i pvalues" % len(pvalues)) fdr = Stats.doFDR(pvalues) E.info("got %i qvalues" % len(fdr.mQValues)) qvalues = ["global_qvalue"] + fdr.mQValues tempf2 = P.getTempFile() for line, qvalue in zip(open(tempf1.name, "r"), qvalues): tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue))) tempf2.close() P.load(tempf2.name, outfile, tablename=tablename, options="--allow-empty-file " "--add-index=category " "--add-index=track,geneset,annotationset " "--add-index=geneset " "--add-index=annotationset " "--add-index=goid ") os.unlink(tempf1.name) os.unlink(tempf2.name)
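# The global q-value above comes from pooling min(pover, punder) across all
# result files and zipping fdr.mQValues back onto the rows. A condensed
# sketch of that pool-and-append step, with hypothetical in-memory rows
# instead of the temporary files used above:
rows = []       # one dict per parsed result line, with 'pover'/'punder' keys
pvalues = [min(float(row["pover"]), float(row["punder"])) for row in rows]
fdr = Stats.doFDR(pvalues)
for row, qvalue in zip(rows, fdr.mQValues):
    row["global_qvalue"] = qvalue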
def decorator_median_score(values, start, end, contig):
    """compute median of values."""
    d = Stats.DistributionalParameters(values)
    return d['median'], str(d)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $") parser.add_option("--methods", dest="methods", type="choice", action="append", choices=("summary-numbers", "jalview", "positive-site-table", "positive-site-list", "count-positive-sites"), help="methods for analysis.") parser.add_option("--selection-mode", dest="selection_mode", type="choice", choices=("all", "consistent", "emes"), help="how to select positive sites.") parser.add_option("--prefix", dest="prefix", type="string", help="prefix for rows.") parser.add_option("--pattern-input-filenames", dest="pattern_input_filenames", type="string", help="input pattern.") parser.add_option("--filter-probability", dest="filter_probability", type="float", help="threshold for probability above which to include positive sites [default=%default].") parser.add_option("--filter-omega", dest="filter_omega", type="float", help="threshold for omega above which to include positive sites [default=%default].") parser.add_option("--models", dest="models", type="string", help="restrict output to set of site specific models.") parser.add_option("--analysis", dest="analysis", type="string", help="restrict output to set of analysis [beb|neb].") parser.add_option("--significance-threshold", dest="significance_threshold", type="float", help="significance threshold for log-likelihood test.") parser.add_option("--filter-mali", dest="filter_mali", type="choice", choices=("none", "gaps"), help="filter by mali to remove gapped positions.") parser.add_option("--filename-mali", dest="filename_mali", type="string", help="filename with multiple alignment used for calculating sites - used for filtering") parser.add_option("--filename-map-mali", dest="filename_map_mali", type="string", help="filename with multiple alignment to map sites onto.") parser.add_option("--jalview-titles", dest="jalview_titles", type="string", help="comma separated list of jalview annotation titles.") parser.add_option("--jalview-symbol", dest="jalview_symbol", type="string", help="symbol to use in jalview.") parser.set_defaults( methods=[], prefix=None, filter_probability=0, filter_omega=0, models="", analysis="", significance_threshold=0.05, selection_mode="consistent", filename_mali=None, filename_map_mali=None, jalview_symbol="*", jalview_titles="", filter_mali=None, ) (options, args) = E.Start(parser) if options.jalview_titles: options.jalview_titles = options.jalview_titles.split(",") else: options.jalview_titles = args options.models = options.models.split(",") options.analysis = options.analysis.split(",") for a in options.analysis: if a not in ("beb", "neb"): raise "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a for a in options.models: if a not in ("8", "2", "3"): raise "unknown model: '%s', possible values are 2, 3, 8" % a codeml = WrapperCodeML.CodeMLSites() # filter and extract functions filter_f = lambda x: x.mProbability >= options.filter_probability and x.mOmega >= options.filter_omega extract_f = lambda x: x.mResidue # read multiple results results = [] ninput, noutput, nskipped = 0, 0, 0 headers = [] for f in args: ninput += 1 try: results.append(codeml.parseOutput(open(f, "r").readlines())) except WrapperCodeML.UsageError: if options.loglevel >= 1: options.stdlog.write("# no input from %s\n" % f) nskipped += 1 continue noutput += 1 headers.append(f) # map of 
nested model (key) to more general model map_nested_models = {'8': '7', '2': '1', '3': '0'} if options.filename_mali: mali = Mali.Mali() mali.readFromFile(open(options.filename_mali, "r")) else: mali = None ############################################################### ############################################################### ############################################################### # use multiple alignment to map residues to a reference mali # or a sequence. ############################################################### if options.filename_map_mali: if not mali: raise "please supply the input multiple alignment, if residues are to be mapped." # translate the alignments def translate(s): sequence = s.mString seq = [] for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]: aa = Genomics.MapCodon2AA(codon) seq.append(aa) s.mString = "".join(seq) tmali = Mali.Mali() tmali.readFromFile(open(options.filename_mali, "r")) tmali.apply(translate) tmap_mali = Mali.Mali() tmap_mali.readFromFile(open(options.filename_map_mali, "r")) if tmap_mali.getAlphabet() == "na": tmap_mali.apply(translate) map_old2new = alignlib_lite.py_makeAlignmentVector() mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali)) if tmap_mali.getLength() == 1: s = tmap_mali.values()[0].mString mali2 = alignlib_lite.py_makeSequence(s) # see if you can find an identical subsequence and then align to # thisD for x in tmali.values(): if s in re.sub("[- .]+", "", x.mString): mali1 = alignlib_lite.py_makeSequence(x.mString) break else: mali2 = alignlib_lite.py_makeProfileFromMali( convertMali2Mali(tmap_mali)) alignator = alignlib_lite.py_makeAlignatorDPFull( alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0) alignator.align(map_old2new, mali1, mali2) consensus = tmap_mali.getConsensus() if options.loglevel >= 4: options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet()) options.stdlog.write("# orig : %s\n" % tmali.getConsensus()) options.stdlog.write("# mapped: %s\n" % consensus) options.stdlog.write("# alignment: %s\n" % map_old2new.Write()) else: map_old2new = None for method in options.methods: if method == "summary-numbers": options.stdlog.write( """# Numbers of positive sites. # # The consistent row/column contains positive sites that are significant # (above thresholds for probability and omega) for all models/analysis # that have been selected (label: cons). # # The log-likelihood ratio test is performed for model pairs, depending # on the output chosen. # Significance threshold: %6.4f # The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0. 
# """ % options.significance_threshold ) # write header if options.prefix: options.stdout.write("prefix\t") options.stdout.write("method\tnseq\t") h = [] for model in options.models: for analysis in options.analysis: h.append("%s%s" % (analysis, model)) h.append("p%s" % (model)) h.append("df%s" % (model)) h.append("chi%s" % (model)) h.append("lrt%s" % (model)) options.stdout.write("\t".join(h)) options.stdout.write("\tcons\tpassed\tfilename\n") nmethod = 0 consistent_cols = [None for x in range(len(options.analysis))] passed_tests = {} for m in options.models: passed_tests[m] = 0 for result in results: row_consistent = None if options.prefix: options.stdout.write("%s" % (options.prefix)) options.stdout.write("%i" % nmethod) options.stdout.write("\t%i" % (result.mNumSequences)) npassed = 0 for model in options.models: sites = result.mSites[model] # do significance test full_model, null_model = model, map_nested_models[model] lrt = Stats.doLogLikelihoodTest( result.mSites[full_model].mLogLikelihood, result.mSites[full_model].mNumParameters, result.mSites[null_model].mLogLikelihood, result.mSites[null_model].mNumParameters, options.significance_threshold) x = 0 for analysis in options.analysis: if analysis == "neb": s = set( map(extract_f, filter(filter_f, sites.mNEB.mPositiveSites))) elif analysis == "beb": s = set( map(extract_f, filter(filter_f, sites.mBEB.mPositiveSites))) options.stdout.write("\t%i" % (len(s))) if not lrt.mPassed: s = set() if row_consistent is None: row_consistent = s else: row_consistent = row_consistent.intersection(s) if consistent_cols[x] is None: consistent_cols[x] = s else: consistent_cols[x] = consistent_cols[ x].intersection(s) x += 1 if lrt.mPassed: c = "passed" passed_tests[model] += 1 npassed += 1 else: c = "failed" options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" % (lrt.mProbability, lrt.mDegreesFreedom, lrt.mChiSquaredValue, c)) options.stdout.write( "\t%i\t%i\t%s\n" % (len(row_consistent), npassed, headers[nmethod])) nmethod += 1 if options.prefix: options.stdout.write("%s\t" % options.prefix) options.stdout.write("cons") row_consistent = None total_passed = 0 for model in options.models: x = 0 for analysis in options.analysis: s = consistent_cols[x] if s is None: s = set() options.stdout.write("\t%i" % (len(s))) if row_consistent is None: row_consistent = s else: row_consistent = row_consistent.intersection(s) x += 1 options.stdout.write("\tna\t%i" % passed_tests[model]) total_passed += passed_tests[model] options.stdout.write( "\t%i\t%i\n" % (len(row_consistent), total_passed)) elif method == "jalview": options.stdout.write("JALVIEW_ANNOTATION\n") options.stdout.write("# Created: %s\n\n" % (time.asctime(time.localtime(time.time())))) l = 1 x = 0 for result in results: sites, significance = selectPositiveSites( [result], options.selection_mode, options, mali) codes = [""] * result.mLength if len(sites) == 0: continue for site in sites: codes[site - 1] = options.jalview_symbol options.stdout.write( "NO_GRAPH\t%s\t%s\n" % (options.jalview_titles[x], "|".join(codes))) x += 1 elif method == "count-positive-sites": sites, significance = selectPositiveSites( results, options.selection_mode, options, mali) options.stdout.write("%i\n" % (len(sites))) elif method in ("positive-site-table", ): sites, significance = selectPositiveSites( results, options.selection_mode, options, mali) headers = ["site", "P"] if map_old2new: headers.append("mapped") headers.append("Pm") options.stdout.write("\t".join(headers) + "\n") sites = list(sites) sites.sort() nmapped, nunmapped = 
0, 0 for site in sites: values = [site, "%6.4f" % significance[site]] if map_old2new: r = map_old2new.mapRowToCol(site) if r == 0: values.append("na") values.append("") nunmapped += 1 if options.loglevel >= 2: options.stdlog.write( "# unmapped residue: %i\n" % site) else: values.append(r) values.append(consensus[r - 1]) nmapped += 1 options.stdout.write("\t".join(map(str, (values))) + "\n") if options.loglevel >= 1: options.stdlog.write("# sites: ninput=%i, noutput=%i, nskipped=%i\n" % ( len(sites), nmapped, nunmapped)) E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped)) E.Stop()
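# The significance test above compares nested codeml site models
# (8 vs 7, 2 vs 1, 3 vs 0). Assuming Stats.doLogLikelihoodTest is the
# standard likelihood-ratio test, the computation reduces to twice the
# log-likelihood difference referred to a chi-squared distribution:
from scipy.stats import chi2


def likelihood_ratio_test(lnl_full, k_full, lnl_null, k_null, alpha=0.05):
    """LRT for nested models; returns (statistic, df, pvalue, passed)."""
    statistic = 2.0 * (lnl_full - lnl_null)
    df = k_full - k_null
    pvalue = chi2.sf(statistic, df)
    return statistic, df, pvalue, pvalue < alpha

# e.g. model 8 versus its null model 7:
# chi, df, p, passed = likelihood_ratio_test(lnl8, p8, lnl7, p7)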