def plot_manhattan_plot(dataframe, section, filename_fasta,
                        map_key2label={}, **kwargs):
    plotter = ManhattanPlot(genome_size_file=filename_fasta)
    ax = plotter(dataframe, **kwargs)
    plt.savefig(E.get_output_file(section))
    plt.close()

def plot_mutation_profile_bar_plot(dataframe, section, map_key2label={},
                                   **kwargs):
    # use a separate loop variable so the argument is not shadowed
    for key, group in dataframe.groupby(by="sample"):
        if key == "unique":
            continue
        if group.empty:
            E.warn("no data for {}".format(key))
            continue
        ax = MutationProfileBarPlot()(group)
        label = map_key2label.get(key, key)
        plt.savefig(E.get_output_file("-".join((section, label))))
        plt.close()

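
# -----------------------------------------------------------------------
# For context: E.get_output_file() (cgat-core) derives output paths from
# the --output-filename-pattern command line option, roughly:
#
#     def get_output_file(section):
#         return options.output_filename_pattern % section
#
# so with --output-filename-pattern=plots/%s.png, the helpers above write
# e.g. plots/manhattan.png. This is a sketch of the behaviour for
# orientation, not the library source.
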
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    # note: dest must be "extend" - set_defaults and the code below use
    # options.extend
    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis "
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is "
                      "required [%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip", dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed "
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name is given by "
                      "--output-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata", type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa "
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="threshold p-value to determine which read "
                      "pile-ups are the result of PCR overamplification "
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="comma-delimited list of chromosomes to include "
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" %
                                 (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting "
                         "bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')

            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write(
                "estimated_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write(
                "true_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write(
                "nreads\t%s\n" %
                ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.open_file(
            E.get_output_file("enrichment.tsv.gz"), "w")
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # slot missing: output an empty field
                    outfile.write("\t")
                else:
                    outfile.write("\t" + pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix, file='%s', sep="\t")''' %
                  E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319
                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # note: several Gb in size
                # output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' %
              E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' %
              E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' % locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' % locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()

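
# -----------------------------------------------------------------------
# A minimal sketch of the isPaired() helper called throughout the MEDIPS
# main() above; the real definition lives elsewhere in this script.
# Because its return value is interpolated directly into R code
# (paired = %(paired)s), it must yield the strings "TRUE"/"FALSE".
# Sampling only the first 1000 alignments is an assumption of this
# sketch.
def isPaired_sketch(filename, nreads=1000):
    '''return "TRUE" if any of the first `nreads` alignments in
    `filename` is paired in sequencing, else "FALSE".'''
    with pysam.AlignmentFile(filename) as inf:
        for read in inf.head(nreads):
            if read.is_paired:
                return "TRUE"
    return "FALSE"
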
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--input-bed-file", dest="input_bed_file", type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-m", "--merge-intervals", dest="merge_intervals",
        action="store_true",
        help="merge intervals in bed file. Useful if you have a site "
        "bed-file [%default]")

    parser.add_option(
        "-f", "--reference-fasta-file", dest="reference_fasta_file",
        help="reference genomic sequence in fasta format. "
        "[%default]")

    parser.add_option(
        "-c", "--barcode-fasta-file", dest="barcode_fasta_file",
        help="barcode sequence in fasta format. Variable positions "
        "should be marked by N "
        "[%default]")

    parser.set_defaults(
        reference_fasta_file=None,
        barcode_fasta_file=None,
        merge_intervals=False,
        input_bed_file=None,
        anchor=5,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
        bamfile = args[0]
    else:
        bamfile = "-"

    if options.barcode_fasta_file:
        with pysam.FastxFile(options.barcode_fasta_file) as inf:
            barcode_sequence = next(inf).sequence
    else:
        barcode_sequence = None

    if not os.path.exists(options.reference_fasta_file):
        raise OSError("reference fasta file {} does not exist".format(
            options.reference_fasta_file))

    if not os.path.exists(options.input_bed_file):
        raise OSError("input bed file {} does not exist".format(
            options.input_bed_file))

    bed_in = pysam.TabixFile(options.input_bed_file)
    pysam_in = pysam.AlignmentFile(bamfile)
    anchor = options.anchor

    for region_idx, vals in enumerate(
            iterate_bed(bed_in, options.merge_intervals)):

        if region_idx > 0:
            raise NotImplementedError(
                "output for multiple regions not yet implemented")

        contig, region_start, region_end = vals
        upstream_anchors, downstream_anchors = [], []
        counter = E.Counter()

        unaligned_fn = E.get_output_file(
            "unaligned_{}.fasta".format(region_idx))

        with IOTools.open_file(unaligned_fn, "w") as outf:
            for read in pysam_in.fetch(contig, region_start, region_end):
                counter.overlapping_reads += 1
                try:
                    pairs = read.get_aligned_pairs(with_seq=True)
                except ValueError:
                    counter.no_md_tag += 1
                    continue

                map_ref2read_pos = dict(
                    (x[1], x[0]) for x in pairs if x[0] is not None)
                map_ref2ref_base = dict(
                    (x[1], x[2]) for x in pairs if x[0] is not None)

                upstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_start - anchor, region_start))
                downstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_end, region_end + anchor))

                # check that at least one anchor is fully aligned
                upstream_matches = sum(
                    [x.isupper() for x in upstream_anchor])
                downstream_matches = sum(
                    [x.isupper() for x in downstream_anchor])

                if upstream_matches < anchor and \
                   downstream_matches < anchor:
                    counter.no_anchor += 1
                    continue

                seq = read.query_alignment_sequence

                # collect full length anchors
                upstream_anchor_start, upstream_anchor_end = \
                    region_start - anchor, region_start
                downstream_anchor_start, downstream_anchor_end = \
                    region_end, region_end + anchor

                if upstream_anchor_start in map_ref2read_pos and \
                   upstream_anchor_end in map_ref2read_pos:
                    upstream_anchors.append(
                        seq[map_ref2read_pos[upstream_anchor_start]:
                            map_ref2read_pos[upstream_anchor_end]])

                if downstream_anchor_start in map_ref2read_pos and \
                   downstream_anchor_end in map_ref2read_pos:
                    downstream_anchors.append(
                        seq[map_ref2read_pos[downstream_anchor_start]:
                            map_ref2read_pos[downstream_anchor_end]])

                # get region to align
                read_start = min(
                    (map_ref2read_pos.get(x, len(seq))
                     for x in range(region_start - anchor, region_start)))
                if read_start == len(seq):
                    read_start = 0

                read_end = max(
                    (map_ref2read_pos.get(x, 0) + 1
                     for x in range(region_end, region_end + anchor)))
                if read_end == 1:
                    read_end = len(seq)

                counter.collected_reads += 1
                outf.write(">{}/{}-{}\n{}\n".format(
                    read.query_name, read_start, read_end,
                    seq[read_start:read_end]))

        counter.downstream_anchors = len(downstream_anchors)
        counter.upstream_anchors = len(upstream_anchors)

        E.info(counter)

        if counter.overlapping_reads == 0:
            E.warn("no sequences overlapping region")
            continue

        if counter.downstream_anchors == 0 or \
           counter.upstream_anchors == 0:
            E.warn("at least one anchor undefined")
            continue

        if counter.collected_reads == 1:
            E.warn("only a single sequence, multiple alignment skipped")
            with IOTools.open_file(unaligned_fn) as inf:
                stdout = inf.read()
        else:
            # G-INS-i -> global alignment algorithm
            E.info("starting mafft multiple alignment")
            stdout = E.run(
                "mafft --globalpair --maxiterate 100 --quiet "
                "--op 2 --ep 0.5 {}".format(unaligned_fn),
                return_stdout=True)

        aligned_fn = E.get_output_file(
            "aligned_{}.fasta".format(region_idx))
        with IOTools.open_file(aligned_fn, "w") as outf:
            outf.write(stdout)

        # mafft output: one header line followed by one sequence line
        # per record
        mali = stdout.splitlines()
        identifiers = [mali[x] for x in range(0, len(mali), 2)]
        sequences = [mali[x].upper() for x in range(1, len(mali), 2)]

        consensus = get_consensus(sequences)
        E.info("after alignment: consensus={}".format(consensus))

        # gap filtering -> remove highly gapped columns
        consensus = get_consensus(sequences, min_gap_proportion=0.9)
        E.info("after gap filtering: consensus={}".format(consensus))

        take = [idx for idx, x in enumerate(consensus) if x != "-"]
        sequences = ["".join([s[x] for x in take]) for s in sequences]
        consensus = get_consensus(sequences, min_gap_proportion=0.9)
        E.info("after removing gapped columns: consensus={}".format(
            consensus))

        # get anchor consensus and chop it off
        consensus = get_consensus(sequences, ignore_gaps=True)
        upstream_anchor = get_anchor_consensus(upstream_anchors)
        downstream_anchor = get_anchor_consensus(downstream_anchors)

        upstream_anchor_start = consensus.find(upstream_anchor)
        downstream_anchor_start = consensus.rfind(downstream_anchor)

        E.info(
            "anchor consensus (no gaps)={}, upstream={}, downstream={}, "
            "upstream_idx={}, downstream_idx={}".format(
                consensus, upstream_anchor, downstream_anchor,
                upstream_anchor_start, downstream_anchor_start))

        if upstream_anchor_start < 0 or downstream_anchor_start < 0:
            E.warn("can't locate anchor, no output produced")
            continue

        upstream_anchor_end = upstream_anchor_start + len(upstream_anchor)
        if upstream_anchor_end >= downstream_anchor_start:
            E.warn("anchors not in correct order, no output produced")
            continue

        sequences = [x[upstream_anchor_end:downstream_anchor_start]
                     for x in sequences]
        consensus = get_consensus(sequences)
        E.info("after anchor trimming: consensus={}".format(consensus))

        truncated_fn = E.get_output_file(
            "aligned_truncated_{}.fasta".format(region_idx))
        with IOTools.open_file(truncated_fn, "w") as outf:
            outf.write("\n".join(
                "{}\n{}\n".format(x, y)
                for x, y in zip(identifiers, sequences)))

        positions = list(zip(*sequences))
        bases = ["A", "C", "G", "T"]
        df = pandas.DataFrame(
            [collections.Counter(x) for x in positions]).fillna(0)
        for missing_base in [x for x in bases if x not in df.columns]:
            df[missing_base] = 0
        df["gapped_depth"] = df.sum(axis=1)
        df["depth"] = df[bases].sum(axis=1)
        df["consensus"] = df[bases].idxmax(axis=1)
        df["consensus_counts"] = df.lookup(df.index, df.consensus)
        df["consensus_support"] = df.consensus_counts / df.depth
        df["offconsensus_counts"] = df.depth - df.consensus_counts
        df.loc[df.consensus_counts == 0, "consensus"] = "N"
        df["region_id"] = region_idx

        # replace "gap" consensus positions with + character
        alignment = global_align(
            re.sub("-", "+", consensus), barcode_sequence)
        E.info("alignment: consensus {}".format(alignment[0]))
        E.info("alignment: barcode   {}".format(alignment[1]))

        barcode_idx = 0
        deleted_barcode_bases = []
        rows = []
        for c, b in zip(*alignment):
            if c == "-":
                deleted_barcode_bases.append(barcode_idx)
                barcode_idx += 1
            elif b == "N":
                rows.append((barcode_idx, "variable"))
                barcode_idx += 1
            elif b == "-":
                rows.append(("", "insertion"))
            elif b == c:
                rows.append((barcode_idx, "fixed-match"))
                barcode_idx += 1
            else:
                rows.append((barcode_idx, "fixed-mismatch"))
                barcode_idx += 1

        alignment_df = pandas.DataFrame.from_records(
            rows,
            columns=["barcode_pos", "barcode_class"])

        assert len(alignment_df) == len(df)
        df = pandas.concat([df, alignment_df], axis=1)

        with E.open_output_file("pileup") as outf:
            df.to_csv(outf, sep="\t", index=True, index_label="position")

        observed_barcode_sequence = "".join(
            df[df.barcode_class == "variable"].consensus)
        headers = df.consensus_support.describe().index
        eval_df = df.loc[
            df.barcode_class.isin(
                ("variable", "fixed-match", "fixed-mismatch")), ]
        median_consensus_depth = eval_df.consensus_counts.median()

        # if depth is low, do not report deleted bases
        if median_consensus_depth <= 2:
            deleted_barcode_bases = []

        outf = options.stdout
        # TODO: add modules to recover partial bar-codes
        outf.write("\t".join(map(str, [
            "barcode", "ndeleted_barcode_bases", "deleted_barcode_bases"] +
            ["support_{}".format(x) for x in headers] +
            ["counts_{}".format(x) for x in headers] +
            ["offcounts_{}".format(x) for x in headers])) + "\n")

        outf.write("\t".join(map(str, [
            observed_barcode_sequence,
            len(deleted_barcode_bases),
            ",".join(map(str, deleted_barcode_bases))] +
            eval_df.consensus_support.describe().tolist() +
            eval_df.consensus_counts.describe().tolist() +
            eval_df.offconsensus_counts.describe().tolist())) + "\n")

    E.stop()

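
# -----------------------------------------------------------------------
# Hedged sketches of two helpers used above (get_consensus and
# get_anchor_consensus); the real definitions live elsewhere in this
# script and the exact column rules here are assumptions. Per alignment
# column: with ignore_gaps=True, gaps are dropped before the majority
# vote; with min_gap_proportion set, a column whose gap fraction reaches
# the threshold is reported as "-" (and later removed by the caller).
def get_consensus_sketch(sequences, min_gap_proportion=None,
                         ignore_gaps=False):
    consensus = []
    for column in zip(*sequences):
        counts = collections.Counter(column)
        if min_gap_proportion is not None and \
           counts.get("-", 0) / len(column) >= min_gap_proportion:
            consensus.append("-")
            continue
        if ignore_gaps:
            counts.pop("-", None)
        consensus.append(counts.most_common(1)[0][0] if counts else "-")
    return "".join(consensus)


def get_anchor_consensus_sketch(anchors):
    '''most frequent full-length anchor sequence (assumption).'''
    return collections.Counter(anchors).most_common(1)[0][0]
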
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--session", dest="session", type="string",
                      help="load session before creating plots "
                      "[%default]")

    parser.add_option("-d", "--snapshot-dir", dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o", "--host", dest="host", type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p", "--port", dest="port", type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend each interval by a number of bases "
                      "[%default]")

    parser.add_option("-x", "--expand", dest="expand", type="float",
                      help="expand each region by a certain factor "
                      "[%default]")

    parser.add_option("--session-only", dest="session_only",
                      action="store_true",
                      help="plot session after opening, "
                      "ignore intervals "
                      "[%default]")

    # these two options are referenced below; their definitions were
    # missing and have been restored
    parser.add_option("--new-instance", dest="new_instance",
                      action="store_true",
                      help="start a new IGV instance [%default]")

    parser.add_option("--keep-open", dest="keep_open",
                      action="store_true",
                      help="keep a newly started IGV instance open "
                      "after the script finishes [%default]")

    parser.add_option("-n", "--name", dest="name", type="choice",
                      choices=("bed-name", "increment"),
                      help="name to use for snapshot "
                      "[%default]")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        new_instance=False,
        keep_open=False,
        name="bed-name",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if options.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=options.command,
                                   port=options.port)
        E.info("new IGV process started")

    E.info("connecting to process on %s:%s" % (options.host, options.port))
    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV(host=options.host,
              port=options.port,
              snapshot_dir=os.path.abspath(options.snapshotdir))

    if options.session:
        E.info('loading session from %s' % options.session)
        igv.load(options.session)
        E.info('loaded session')

    if options.session_only:
        E.info('plotting session only, ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(options.session), options.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(options.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(options.stdin,
                                        parser=pysam.asBed()):

            c.input += 1

            # IGV can not deal with white-space in filenames
            if options.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif options.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = options.extend
            if options.expand:
                d = end - start
                extend = max(extend, (options.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, options.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not options.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()

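
# -----------------------------------------------------------------------
# For orientation: the IGV helper class used above wraps IGV's batch
# command port. "goto" and "snapshot" are documented IGV batch commands;
# the sketch below shows the kind of exchange the class is assumed to
# perform (one newline-terminated command, one-line reply). It is an
# illustration, not the class implementation.
def igv_command_sketch(host, port, command):
    import socket
    with socket.create_connection((host, port)) as sock:
        sock.sendall(command.encode("ascii") + b"\n")
        return sock.makefile().readline().strip()

# e.g. igv_command_sketch("127.0.0.1", 61111, "goto chr1:10000-20000")
#      igv_command_sketch("127.0.0.1", 61111, "snapshot interval.png")
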
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("geneprofile", "tssprofile", "utrprofile",
                               "intervalprofile", "midpointprofile",
                               "geneprofilewithintrons",
                               "geneprofileabsolutedistancefromthreeprimeend",
                               "separateexonprofile",
                               "separateexonprofilewithintrons"),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note: using geneprofilewithintrons or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy '
                      'option [%default].')

    parser.add_option("-b", "--bam-file", "--bedfile", "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string", action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c", "--control-bam-file", dest="controlfiles",
                      metavar="BAM",
                      type="string", action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file "
                      "[%default]")

    parser.add_option("-g", "--gtf-file", dest="gtffile", type="string",
                      metavar="GTF",
                      help="GTF file to use. [%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max",
                               "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile",
                      dest="profile_normalizations",
                      type="choice", action="append",
                      choices=("all", "none", "area", "counts",
                               "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r", "--reporter", dest="reporter", type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'gene' is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i", "--shift-size", dest="shifts", type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u", "--use-base-accuracy", dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The "
                      "default is to only use the start and end of the "
                      "aligned region (RNA-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extends", type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream", dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream", type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr", type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr", type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds", dest="resolution_cds",
                      type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon", dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp "
                      "[%default]")

    parser.add_option("--resolution-last-exon", dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp "
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns", type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream", dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp "
                      "[%default]")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream", type="int",
                      help="extension downstream from the last exon in "
                      "bp [%default]")

    parser.add_option("--extension-inward", dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp "
                      "[%default]")

    parser.add_option("--extension-outward", dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in "
                      "bp [%default]")

    parser.add_option("--scale-flank-length", dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene "
                      "length [%default]")

    parser.add_option(
        "--control-factor", dest="control_factor", type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles", dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file", dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead of recomputing the profile. "
                      "Useful for plotting the meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance between seeing enough of the 3' bias
        # and not omitting too many genes. Tim 31st Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level, so a long
        # region is not needed; a long region has the side effect of
        # omitting more genes. Tim 31st Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # the extension can simply be the same as the resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # keep for backwards compatibility
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = IOTools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for method_requiring_base_accuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # If you implement any method where spliced-out introns or
        # exons should not appear to be covered by non-existent reads,
        # let that method imply --use-base-accuracy by adding it here.
        if method_requiring_base_accuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(
            GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(
            GTF.iterator(options.gtffile))

    # select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.AlignmentFile(x, "rb")
                        for x in options.infiles]

            if options.controlfiles:
                controlfiles = [pysam.AlignmentFile(x, "rb")
                                for x in options.controlfiles]
            else:
                controlfiles = None

            format = "bam"
            if options.merge_pairs:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            elif options.shifts or options.extends:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            elif options.base_accuracy:
                range_counter = geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [pysam.Tabixfile(x)
                                for x in options.controlfiles]
            else:
                controlfiles = None

            range_counter = geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            wigfiles = [BigWigFile(file=open(x)) for x in options.infiles]
            range_counter = geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError(
                "can't determine file type for %s" % str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        elif method == "geneprofile":
            counters.append(
                geneprofile.GeneCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                geneprofile.GeneCounterWithIntrons(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_introns,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim 31st Aug 2013: a possible future feature, if five
            # prime bias is of interest (you would need to create
            # another class). It is not very difficult to derive from
            # this class, but it is not implemented yet. This future
            # feature would differ slightly from the TSS profile
            # already implemented, because here introns are skipped.
            counters.append(
                geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                geneprofile.TSSCounter(
                    range_counter,
                    options.extension_outward,
                    options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                geneprofile.RegionCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                geneprofile.MidpointCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        # split first and last exons out; requires a representative
        # transcript for each gene; the gtf should be sorted by gene
        # and position
        elif method == "separateexonprofile":
            counters.append(
                geneprofile.SeparateExonCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_first,
                    options.resolution_last,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                geneprofile.SeparateExonWithIntronCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_first,
                    options.resolution_last,
                    options.resolution_cds,
                    options.resolution_introns,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                IOTools.open_file(
                    E.get_output_file(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(
            IOTools.open_file(options.input_filename_counts),
            sep='\t', header=0, index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')

        # build counter based on reference counter
        counter = geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        feature_names = geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = ["none",
                                          "area",
                                          "counts",
                                          "background"]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()

        with IOTools.open_file(
                E.get_output_file(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" % "\t".join(
                options.profile_normalizations))
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, cols in enumerate(zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t" % (
                    row, "\t".join([str(x) for x in cols[:-1]])))
                outfile.write("%s\n" % (
                    "\t".join([str(x) for x in cols[-1]])))

        with IOTools.open_file(
                E.get_output_file(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile",
                          "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x)
                                 for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.get_output_file(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.get_output_file(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(list(range(-options.extension_outward,
                                    options.extension_inward)),
                         counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(list(range(-options.extension_inward,
                                    options.extension_outward)),
                         counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(list(range(-options.extension_outward,
                                    options.extension_inward)),
                         counter.aggregate_counts[0])
                plt.plot(list(range(-options.extension_inward,
                                    options.extension_outward)),
                         counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.get_output_file(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.get_output_file(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

    # write footer and output benchmark information.
    E.stop()

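
# -----------------------------------------------------------------------
# A hedged illustration of the profile normalizations selected via
# --normalize-profile, assuming conventional definitions: "area"
# rescales the profile to sum to 1, "counts" divides by the number of
# transcripts counted, and "background" divides by the mean of the
# outermost background_region_bins bins on each flank. The geneprofile
# counters implement the real logic; this is only a sketch.
def normalize_profile_sketch(profile, method, background_region_bins=10,
                             ntranscripts=1):
    profile = numpy.asarray(profile, dtype=float)
    if method == "none":
        return profile
    if method == "area":
        return profile / profile.sum()
    if method == "counts":
        return profile / ntranscripts
    if method == "background":
        background = numpy.concatenate(
            [profile[:background_region_bins],
             profile[-background_region_bins:]])
        return profile / background.mean()
    raise ValueError("unknown normalization: %s" % method)
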
def plot_depth_profile_plot(dataframe, section, map_key2label={}, **kwargs):
    ax = DepthProfilePlot()(dataframe, map_sample2label=map_key2label)
    plt.savefig(E.get_output_file(section))
    plt.close()

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--exclusive-overlap", dest="exclusive",
        action="store_true",
        help="intervals reported will be merged across the "
        "positive set and will not overlap any interval in any of the "
        "other sets [default=%default].")

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern_id", type="string",
        help="pattern to convert a filename to an id "
        "[default=%default].")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("merged-combinations", "unmerged-combinations"),
        help="method to perform [default=%default]")

    parser.set_defaults(
        pattern_id="(.*).bed.gz",
        exclusive=False,
        method="merged-combinations",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    tags, bedfiles = [], []
    for infile in args:
        bedfiles.append(pysam.Tabixfile(infile, "r"))
        tags.append(re.search(options.pattern_id, infile).groups()[0])

    indices = list(range(len(bedfiles)))
    is_exclusive = options.exclusive

    if options.method == "merged-combinations":

        if is_exclusive:
            start = 1
        else:
            start = 2

        options.stdout.write("combination\twithout\tcounts\n")

        for ncombinants in range(start, len(bedfiles) + 1):
            for combination in itertools.combinations(indices,
                                                      ncombinants):
                other = [x for x in indices if x not in combination]
                tag = ":".join([tags[x] for x in combination])
                E.debug("combination %s started" % tag)
                E.debug("other: %s" % ":".join([tags[x] for x in other]))
                other_bed = [bedfiles[x] for x in other]
                outf = IOTools.open_file(
                    E.get_output_file(tag), "w", create_dir=True)
                c = E.Counter()
                for contig, start, end in combineMergedIntervals(
                        [bedfiles[x] for x in combination]):
                    c.found += 1
                    if is_exclusive and isContainedInOne(contig,
                                                         start,
                                                         end,
                                                         other_bed):
                        c.removed += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%i\t%i\n" % (contig, start, end))

                outf.close()
                E.info("combination %s finished: %s" % (tag, c))

                options.stdout.write("%s\t%s\t%i\n" % (
                    ":".join([tags[x] for x in combination]),
                    ":".join([tags[x] for x in other]),
                    c.output))

    elif options.method == "unmerged-combinations":
        options.stdout.write("track\tcombination\twithout\tcounts\n")

        for foreground in indices:

            background = [x for x in indices if x != foreground]
            for ncombinants in range(0, len(background) + 1):
                for combination in itertools.combinations(background,
                                                          ncombinants):
                    other = [x for x in background
                             if x not in combination]
                    combination_bed = [bedfiles[x] for x in combination]
                    other_bed = [bedfiles[x] for x in other]
                    tag = ":".join([tags[foreground]] +
                                   [tags[x] for x in combination])

                    E.debug("fg=%i, combination=%s, other=%s" %
                            (foreground, combination, other))
                    E.debug("combination %s started" % tag)
                    E.debug("other: %s" %
                            ":".join([tags[x] for x in other]))

                    outf = IOTools.open_file(
                        E.get_output_file(tag), "w", create_dir=True)
                    c = E.Counter()
                    for bed in combineUnmergedIntervals(
                            bedfiles[foreground],
                            combination_bed):
                        c.found += 1
                        if is_exclusive and isContainedInOne(bed.contig,
                                                             bed.start,
                                                             bed.end,
                                                             other_bed):
                            c.removed += 1
                            continue
                        c.output += 1
                        outf.write("%s\n" % str(bed))

                    outf.close()
                    E.info("combination %s finished: %s" % (tag, c))

                    options.stdout.write("%s\t%s\t%s\t%i\n" % (
                        tags[foreground],
                        ":".join([tags[x] for x in combination]),
                        ":".join([tags[x] for x in other]),
                        c.output))

    E.stop()
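
# -----------------------------------------------------------------------
# A minimal sketch of the isContainedInOne() helper used above; the
# actual implementation lives elsewhere in this script. It is assumed
# to report whether the interval overlaps at least one interval in any
# of the given tabix-indexed bed files, treating contigs missing from
# an index as "no overlap".
def isContainedInOne_sketch(contig, start, end, bedfiles):
    for bedfile in bedfiles:
        try:
            if next(bedfile.fetch(contig, start, end), None) is not None:
                return True
        except ValueError:
            # contig not present in this tabix index
            continue
    return False
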