def writeResults(outfile, results):
    """Write per-host timing summaries as a tab-separated table.

    :param outfile: open file-like object to write to.
    :param results: mapping of host name -> list of per-run records; each
        record is indexable by the timing field names below
        (e.g. ``record["wall"]``).

    The header line combines each timing field with the column headers of
    ``Stats.Summary``; then one row is written per host (in sorted order)
    with one ``Stats.Summary`` per field.
    """
    fields = ("wall", "user", "sys", "cuser", "csys", "nchunks")
    outfile.write("host\t%s\n" % "\t".join(
        ["%s_%s" % (x, y) for x, y in
         itertools.product(fields, Stats.Summary().getHeaders())]))

    # sorted() over the mapping replaces the old ``keys(); keys.sort()``
    # pattern, which fails on Python 3 where dict views have no .sort().
    for host in sorted(results):
        result = results[host]
        outfile.write("%s" % host)
        for f in fields:
            # plain indexing instead of calling __getitem__ explicitly
            d = [y[f] for y in result]
            outfile.write("\t%s" % Stats.Summary(d))
        outfile.write("\n")
def __str__(self):
    """Render summary statistics for the collected transcripts.

    Returns a tab-separated line with: number of gene ids, number of
    transcript ids, the count of single-exon transcripts, followed by
    ``Stats.Summary`` columns for exons per transcript, exon sizes,
    intron sizes and transcript lengths.
    """
    n_single_exon = 0
    n_exons_per_transcript = []
    intron_lengths = []
    transcript_spans = []
    exon_lengths = []

    for exon_list in self.counts_exons_per_transcript.values():
        # sort in place, then merge overlapping exon intervals
        exon_list.sort()
        merged = Intervals.combine(exon_list)

        transcript_spans.append(merged[-1][1] - merged[0][0])
        n_exons_per_transcript.append(len(merged))
        exon_lengths.extend(end - start for start, end in merged)

        if len(merged) == 1:
            n_single_exon += 1
            continue

        # gaps between consecutive merged exons are introns
        previous_end = merged[0][1]
        for start, end in merged[1:]:
            intron_lengths.append(start - previous_end)
            previous_end = end

    columns = (
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        n_single_exon,
        Stats.Summary(n_exons_per_transcript),
        Stats.Summary(exon_lengths),
        Stats.Summary(intron_lengths),
        Stats.Summary(transcript_spans),
    )
    return "\t".join(str(c) for c in columns)
def process(self, contig, start, end, reads, qualities):
    """Write one GTF "exon" entry summarising read coverage over an interval.

    :param contig: contig/chromosome name for the entry.
    :param start: interval start coordinate.
    :param end: interval end coordinate.
    :param reads: sequence of per-position read counts; their mean becomes
        the GTF score.
    :param qualities: per-position qualities (not used by this method).
    """
    entry = GTF.Entry()
    entry.start, entry.end = start, end
    # NOTE(review): `id` here is the Python builtin function, not a local
    # counter - `self.mIdFormat % id` therefore formats the builtin object
    # itself (or raises if the format expects an integer). This looks like
    # a bug (a missing id counter variable); confirm intended behaviour.
    entry.gene_id = self.mIdFormat % id
    entry.transcript_id = entry.gene_id
    entry.contig = contig
    entry.feature = "exon"
    entry.source = "maq"
    read_stats = Stats.Summary(reads)
    # score is the mean read depth, formatted with two decimals
    entry.score = "%5.2f" % read_stats['mean']
    self.mOutFile.write(str(entry) + "\n")
def main(argv=None):
    """script main.

    Reads a fastq file from stdin, optionally converts quality scores to a
    target encoding, and writes per-read quality statistics (number of
    low-quality bases, number of Ns, and ``Stats.Summary`` columns) to stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    # converting iterator if a target format is requested, otherwise
    # only guess the encoding of the input
    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        # phred-scaled qualities for this read
        quals = record.toPhred()
        # bases below the quality threshold
        nfailed = len([x for x in quals if x < min_quality])
        # ambiguous bases: both 'N' and '.' spellings are counted
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    Reads one (or two) vectors of values and applies an R statistical test
    (t-test or Wilcoxon); results, summary statistics and - for the t-test -
    power estimates are written to stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1", "--infile", dest="filename_input", type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header", dest="header", type="string",
                      help="""header of value column [default=%default].""")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # read the primary vector either from file or stdin
    if options.filename_input:
        infile = open(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    # remaining positional arguments are forwarded to the R test:
    # "key=value" entries become keyword arguments, the rest positional
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    # optional second sample for a two-sample test
    if options.filename_input2:
        infile = open(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)

        # compute power of test
        # NOTE(review): indentation reconstructed - the power computation is
        # taken to run for the t-test regardless of one or two samples;
        # confirm against the original layout.
        power = R.power_t_test(n=len(values),
                               delta=abs(stat["mean"]),
                               sd=stat["stddev"],
                               sig_level=0.05)['power']
        diff_at_power95 = R.power_t_test(n=len(values),
                                         power=0.95,
                                         sd=stat["stddev"],
                                         sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))
    for key, value in sorted(result.items()):
        # the R result's "data.name" entry is noise for tabular output
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    # append the summary statistics of the primary sample
    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    # NOTE(review): truthiness test - a power of exactly 0.0 would also be
    # skipped here; presumably intended as "is not None".
    if power:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.Stop()
def main(argv=None):
    """script main.

    Compares two distributions of values with an R statistical test
    (Kolmogorov-Smirnov, Mann-Whitney U, paired variants, or Shapiro-Wilk
    normality test), optionally plotting boxplot, qq-plot and histograms.
    Test results and per-sample ``Stats.Summary`` statistics go to stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms." "")
    parser.add_option("-f", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n", "--norm-test", dest="norm_test", action="store_true",
        help=
        """test if a set of values is normally distributed.
Mean and variance are calculated from the data.""")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # extra positional arguments are forwarded to the R test call:
    # "key=value" become keyword arguments, the rest positional
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # optional category -> value mapping; when present the input is read
    # as strings and translated through the map
    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map,
                                                  "r"),
                                             map_functions=(str, float))
        f = str
    else:
        f = float

    # read distribution 1 from file or stdin
    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        # normality test: distribution 2 is sampled in R from a normal
        # distribution with the empirical mean/stddev of distribution 1
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n" %
            (len(values1), mean, stddev))
        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(open(options.filename_input2,
                                                 "r"),
                                            map_function=f,
                                            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    # paired tests require samples of equal size
    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    # dispatch to the chosen R test
    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1, values2, paired=True, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            # R's shapiro.test caps the sample size; downsample first
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        # 2x2 panel layout: boxplot, qq-plot, two histograms
        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            # fixed number of bins, R chooses the break positions
            extra_options += ", breaks=%i" % options.num_bins
        elif options.num_bins and (options.min_value or options.max_value):
            # fixed number of bins within an explicit value range
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))
        elif options.bin_size is not None:
            # fixed bin width
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2,
freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        # "data.name" is R bookkeeping, not a result value
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics for each sample, suffixed 1 and 2
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
def main(argv=None):
    """script main.

    Reads intervals (gff/gtf/bed) from stdin and computes interval sizes,
    distances between consecutive intervals on the same contig, and overlap
    sizes; results are reported as histograms, summary statistics and/or raw
    value lists depending on --method.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option(
        "--max-value", dest="max_value", type="float",
        help="maximum value for histogram.")
    parser.add_option(
        "--no-empty-bins", dest="no_empty_bins", action="store_true",
        help="do not display empty bins.")
    parser.add_option(
        "--with-empty-bins", dest="no_empty_bins", action="store_false",
        help="display empty bins.")
    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")
    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")
    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")
    parser.add_option("--output-section", dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    (options, args) = E.Start(parser, add_output_options=True)

    # "all" expands to every counting method except "values"
    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.openOutputFile("overlaps")
    else:
        outfile_overlaps = None

    # single pass over the (assumed position-sorted) intervals, tracking
    # the previous interval per contig in `last`
    last = None
    ninput, noverlaps = 0, 0
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                # overlapping with the previous interval
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last),
                                                         str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                # keep whichever interval extends further as `last`
                if this.end > last.end:
                    last = this
                continue
            else:
                # gap between consecutive intervals
                values_between.append(this.start - last.end)
                # if this.start - last.end < 10:
                #     print str(last)
                #     print str(this)
                #     print "=="
                values_overlaps.append(0)

        last = this

    if "hist" in options.methods:
        outfile = E.openOutputFile("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram,
                            nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.openOutputFile("stats")
        # NOTE(review): getHeader() here vs getHeaders() elsewhere in this
        # file - presumably both exist on Stats.Summary; confirm.
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        # dump the raw value lists, one file per category
        outfile = E.openOutputFile("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.openOutputFile("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.openOutputFile("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput, len(values_between), len(values_within), noverlaps))

    E.Stop()
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION

    :param infile: csv file with the expression analysis.
    :param outfiles: pair of output filename stems; ``suffix`` is appended.
    :param map_exp2columns: mapping of experiment name -> list of column
        indices holding that experiment's expression values.
    :param suffix: suffix appended to each output filename.

    ..note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column, but instead it might be the
       transcript_cluster_id. A possible explanation is that if several
       probesets map to the same transcript cluster, the transcript
       cluster is normalized. The set of cluster_id and probeset ids are
       completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.
    '''
    E.info("importing expression data from %s" % infile)

    # load the probeset -> cluster and cluster -> transcript maps
    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
    cc.execute(statement)
    map_cluster2transcript, map_probeset2cluster = {}, {}
    for probeset, cluster, transcript_id in cc.fetchall():
        map_probeset2cluster[probeset] = cluster
        map_cluster2transcript[cluster] = transcript_id

    # keep a handle so the input file can be closed (was leaked before)
    csvfile = open(infile, "rU")
    reader = csv.reader(csvfile)

    first = True
    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    # (input column header, output column name) pairs to carry over
    headers = (("Probe Set ID", "cluster_id"),
               ("Gene Symbol", "genesymbol"),
               ("mRna - Description", "description"),
               ('mRNA Accession', 'mrna_id'),
               ('mRNA Source', 'source'),
               ('mRNA - xhyb', 'xhyb'),
               ('GO Biological Process ID', 'go_biol_id'),
               ('GO Biological Process Term', 'go_biol_term'),
               ('GO Cellular Component ID', 'go_cell_id'),
               ('GO Cellular Component Term', 'go_cell_term'),
               ('GO Molecular Function ID', 'go_mol_id'),
               ('GO Molecular Function Term', 'go_mol_term'),
               ('Pathway Source', 'pw_source'),
               ('Pathway Name', 'pw_name'))

    old_headers = set([x[0] for x in headers])
    new_headers = [x[1] for x in headers]
    take = []
    # fixed typo: was `index_soure`
    index_source, index_accession, index_probeset = None, None, None
    counts = E.Counter()
    found = set()

    outf = open(outfiles[0] + suffix, "w")
    outf.write("# %s\n" % infile)
    outs = open(outfiles[1] + suffix, "w")
    outs.write("# %s\n" % infile)

    writer = csv.writer(outf)
    for row in reader:
        if first:
            first = False
            writer.writerow(row)

            # locate the positions (within `take`) of the special columns
            for x, old_header in enumerate(row):
                if old_header == "mRNA Source":
                    index_source = len(take)
                if old_header == "mRNA Accession":
                    index_accession = len(take)
                if old_header == "Probe Set ID":
                    index_probeset = len(take)
                if old_header in old_headers:
                    take.append(x)

            # write headers to all files
            outs.write("\t".join(new_headers) + "\n")

            for exp, columns in map_exp2columns.items():
                output_files.write(
                    exp, "\t".join(
                        ("cluster_id", Stats.Summary().getHeader(),
                         "\t".join(["R%i" % i
                                    for i in range(len(columns))]))) + "\n")
        else:
            # normalize selected columns; "---" denotes a missing value
            new_row = []
            for x in take:
                if row[x].strip() != "---":
                    new_row.append(row[x].strip())
                else:
                    new_row.append("")

            probeset = new_row[index_probeset].strip()
            if probeset in map_probeset2cluster:
                probeset = map_probeset2cluster[probeset]
                # bug fix: was `counter.mapped_to_cluster` - `counter` is
                # undefined and raised NameError; the counter is `counts`
                counts.mapped_to_cluster += 1

            if probeset not in map_cluster2transcript:
                # unknown cluster: copy the raw row through and skip
                writer.writerow(row)
                counts.skipped += 1
                continue
            else:
                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

            outs.write("\t".join(new_row) + "\n")

            # per-experiment expression values with summary statistics
            for exp, cols in map_exp2columns.items():
                data = [row[x] for x in cols]
                output_files.write(
                    exp, "\t".join(
                        (probeset,
                         str(Stats.Summary([float(x) for x in data])),
                         "\t".join(data))) + "\n")

    csvfile.close()
    outf.close()
    # bug fix: `outs` was never closed
    outs.close()

    if counts.duplicates > 0:
        P.warn("duplicate probeset/clusters")
    P.info("probeset source information: %s" % str(counts))

    output_files.close()