def readAndIndex(iterator, with_value=True):
    '''read from gtf stream and index.

    If *with_value* is True, each interval keeps its gtf record as
    payload; otherwise only the coordinates are stored.

    returns an :class:`IndexedGenome.IndexedGenome`
    '''
    if with_value:
        index = IndexedGenome.IndexedGenome()
        insert = lambda entry: index.add(
            entry.contig, entry.start, entry.end, entry)
    else:
        index = IndexedGenome.Simple()
        insert = lambda entry: index.add(
            entry.contig, entry.start, entry.end)

    for entry in iterator:
        insert(entry)

    return index
def __call__(self, track, slice=None):
    """Compute a per-field ROC of interval reproducibility.

    For each field in ``self.mFields`` the intervals of all replicates
    of *track* are read from the database, merged into union intervals,
    and each union interval is labelled reproducible if every replicate
    contributes at least one overlapping interval.  The maximum field
    value over the overlapping intervals ranks the intervals for the
    ROC computation.

    Returns an ordered dictionary mapping each field to an ordered
    dictionary with keys ``FPR`` and the field name.
    """
    result = odict()
    merged = None
    rocs = []

    for field in self.mFields:
        # collect (contig, start, end, value) rows per replicate
        data = []
        for replicate in EXPERIMENTS.getTracks(track):
            statement = "SELECT contig, start, end,%(field)s FROM %(replicate)s_intervals" % locals()
            data.append(self.get(statement))

        # build one positional index per replicate
        idx = []
        for x in range(len(data)):
            i = IndexedGenome.IndexedGenome()
            for contig, start, end, peakval in data[x]:
                i.add(contig, start, end, peakval)
            idx.append(i)

        def _iter(all):
            # merge overlapping intervals into union intervals
            all.sort()
            last_contig, first_start, last_end, last_value = all[0]
            for contig, start, end, value in all[1:]:
                if contig != last_contig or last_end < start:
                    yield (last_contig, first_start, last_end)
                    last_contig, first_start, last_end = contig, start, end
                else:
                    last_end = max(last_end, end)
            yield (last_contig, first_start, last_end)

        if not merged:
            # the union intervals are identical for every field -
            # compute them only once
            all = [x for x in itertools.chain(*data)]
            merged = list(_iter(all))

        roc_data = []
        for contig, start, end in merged:
            intervals = []
            for i in idx:
                try:
                    intervals.append(list(i.get(contig, start, end)))
                except KeyError:
                    continue

            if len(intervals) == 0:
                continue

            # reproducible iff every replicate overlaps this interval
            is_repro = len([x for x in intervals if x != []]) == len(data)
            value = max([x[2] for x in itertools.chain(*intervals)])

            # fpr, tpr
            roc_data.append((value, is_repro))

        roc_data.sort()
        roc_data.reverse()

        # BUGFIX: zip() returns an iterator in python3; materialize it
        # before subscripting roc[0]/roc[1].
        roc = list(zip(*Stats.computeROC(roc_data)))

        result[field] = odict((("FPR", roc[0]), (field, roc[1])))

    return result
def __init__(self, filename_junctions, *args, **kwargs):
    """Read splice junctions from *filename_junctions* and index them.

    The input is tab-separated; lines starting with ``#`` and the
    header line (first column equal to ``contig``) are skipped.  For
    each junction, two windows of size ``self.mSize`` are added to the
    index - one at the intron start and one at the intron end - each
    carrying ``(strand, intron_start, intron_end, gene_id,
    transcript_id)`` as payload.  The index is stored in
    ``self.mJunctions``.
    """
    BaseAnnotator.__init__(self, *args, **kwargs)

    junctions = IndexedGenome.IndexedGenome()

    infile = open(filename_junctions, "r")
    njunctions = 0
    for line in infile:
        # skip comment lines
        if line.startswith("#"):
            continue
        data = line[:-1].split("\t")
        # skip the header line
        if data[0] == "contig":
            continue

        # end, start are the positions of the last base of the codon
        # 5' of the intron and first base of codon 3' of the intron.
        contig, strand, end, start, frame, gene_id, transcript_id = data

        start, end, frame = int(start), int(end), int(frame)

        # convert to intron coordinates
        intron_start, intron_end = end + 1, start

        # convert to positive strand coordinates
        if strand == "-":
            lcontig = self.mFasta.getLength(contig)
            intron_start, intron_end = \
                lcontig - intron_end, lcontig - intron_start

        # index a window of size mSize at either end of the intron
        junctions.add(contig,
                      intron_start,
                      intron_start + self.mSize,
                      (strand, intron_start, intron_end,
                       gene_id, transcript_id))
        junctions.add(contig,
                      intron_end - self.mSize,
                      intron_end,
                      (strand, intron_start, intron_end,
                       gene_id, transcript_id))
        njunctions += 1

    infile.close()

    self.mJunctions = junctions
    E.info("read and indexed %i junctions for %i contigs" %
           (njunctions, len(junctions)))
def readIntervals(infile, options):
    """Read intervals from *infile* into a genomic index.

    With ``options.format == "gtf"`` transcripts are read and, for
    each transcript, the exon coordinates are collected into an
    alignment that is indexed over the transcript's full extent.  With
    ``"gff"`` plain intervals are indexed without a payload.  Progress
    is reported every ``options.report_step`` entries.

    Returns the populated index.
    """
    ninput = 0
    t = time.time()

    def _report():
        # periodic progress message
        if ninput % options.report_step == 0:
            E.info(
                "reading intervals - progress: ninput=%i, time=%i, avg=%f" %
                (ninput, time.time() - t, float(time.time() - t) / ninput))

    if options.format == "gtf":
        index = IndexedGenome.IndexedGenome()
        for gffs in GTF.transcript_iterator(GTF.iterator(infile)):
            ali = alignlib_lite.py_makeAlignmentBlocks()
            for gff in gffs:
                if gff.feature != "exon":
                    continue
                ali.addDiagonal(gff.start, gff.end, 0)
            # BUGFIX: the contig argument was missing, so the
            # transcript start was silently used as the contig and the
            # remaining arguments were shifted into the wrong slots.
            index.add(gffs[0].contig,
                      min([x.start for x in gffs]),
                      max([x.end for x in gffs]),
                      ali)
            ninput += 1
            _report()

    elif options.format == "gff":
        index = IndexedGenome.Simple()
        for g in GTF.iterator(infile):
            index.add(g.contig, g.start, g.end)
            ninput += 1
            _report()

    else:
        # previously an unknown format fell through to a NameError on
        # ``index`` below - fail with a clear message instead.
        raise ValueError("unknown format %s" % options.format)

    E.info("read intervals: %i contigs, %i intervals" %
           (len(index), ninput))

    return index
def __init__(self, filename_exons, *args, **kwargs):
    """Index exons from the :term:`gtf` file *filename_exons*.

    Each gtf record is indexed by position with the record itself as
    payload.  The index is stored in ``self.mExons``.
    """
    BaseAnnotator.__init__(self, *args, **kwargs)

    exons = IndexedGenome.IndexedGenome()
    nexons = 0
    # BUGFIX: close the input file when done - it was previously
    # opened inline and leaked.
    with open(filename_exons, "r") as infile:
        for g in GTF.iterator(infile):
            exons.add(g.contig, g.start, g.end, g)
            nexons += 1

    self.mExons = exons

    E.info("indexed %i exons on %i contigs" % (nexons, len(exons)))
def buildQuicksectMask(bed_file):
    '''return Quicksect object containing the regions specified

    takes a bed file listing the regions to mask
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for region in Bed.iterator(IOTools.openFile(bed_file)):
        # it is neccessary to extend the region to make an accurate mask
        mask.add(region.contig, region.start - 1, region.end + 1, 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return mask
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''
    dbhandle = sqlite3.connect(PARAMS["database_name"])

    # one positional index of (start, end, value) per track
    tracks = []
    indices = []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cursor = dbhandle.cursor()
        cursor.execute(
            "SELECT contig, start, end, %(field)s FROM %(tablename)s"
            % {"field": field, "tablename": tablename})
        index = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cursor:
            index.add(contig, start, end, peakval)
        indices.append(index)
        tracks.append(track)

    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    # for every interval in the reference set, emit the maximum value
    # of each track's overlapping intervals (empty if none overlap)
    for bed in Bed.iterator(infile=IOTools.openFile(reference, "r")):
        columns = []
        for index in indices:
            try:
                overlaps = list(index.get(bed.contig, bed.start, bed.end))
            except KeyError:
                columns.append("")
                continue
            if overlaps:
                columns.append(str(max([x[2] for x in overlaps])))
            else:
                columns.append("")
        outs.write(str(bed) + "\t" + "\t".join(columns) + "\n")

    outs.close()
def annotateWindows(contig, windows, gff_data, fasta, options):
    """annotate windows.

    Builds a positional index over *gff_data* and, for each
    ``(start, end)`` window in *windows*, collects the overlapping
    intervals, optionally transforms them (``options.transform``) and
    summarizes them with the decorator selected via
    ``options.decorator``.  One tab-separated line per window is
    written to ``options.stdout``.
    """
    index = IndexedGenome.IndexedGenome()
    for g in gff_data:
        index.add(g.contig, g.start, g.end, g)

    is_gtf = options.is_gtf

    # select the interval transformation
    if options.transform == "none":
        transform = lambda wstart, wend, intervals: [
            (x[0], x[1]) for x in intervals]
    elif options.transform == "overlap":
        transform = transform_overlap
    elif options.transform == "complement":
        transform = transform_complement
    elif options.transform == "third_codon":
        transform = transform_third_codon
    else:
        raise ValueError("unknown transform %s" % options.transform)

    # select the decorator; score-based decorators act on the raw
    # score values instead of on intervals
    work_on_intervals = True
    if options.decorator == "counts":
        decorator = decorator_counts
    elif options.decorator == "mean-length":
        decorator = decorator_mean_length
    elif options.decorator == "median-length":
        decorator = decorator_median_length
    elif options.decorator == "percent-coverage":
        decorator = decorator_percent_coverage
    elif options.decorator == "gc":
        decorator = decorator_percent_gc
    elif options.decorator == "median-score":
        decorator = decorator_median_score
        work_on_intervals = False
    elif options.decorator == "mean-score":
        decorator = decorator_mean_score
        work_on_intervals = False
    elif options.decorator == "stddev-score":
        decorator = decorator_stddev_score
        work_on_intervals = False
    elif options.decorator == "min-score":
        decorator = decorator_min_score
        work_on_intervals = False
    elif options.decorator == "max-score":
        decorator = decorator_max_score
        work_on_intervals = False
    else:
        raise ValueError("unknown decorator %s" % options.decorator)

    for start, end in windows:

        # counts/length before/after transformation
        n1, l1, n2, l2 = 0, 0, 0, 0
        values, intervals_with_gff, genes, transcripts = [], [], set(), set()

        try:
            for istart, iend, value in index.get(contig, start, end):
                n1 += 1
                l1 += iend - istart
                intervals_with_gff.append((istart, iend, value))
                values.append(value.score)
                if is_gtf:
                    genes.add(value.gene_id)
                    transcripts.add(value.transcript_id)
        except KeyError:
            pass

        if n1 == 0 and options.skip_empty:
            continue

        if work_on_intervals:

            if options.loglevel >= 3:
                # BUGFIX: this referenced the undefined name
                # ``intervals`` (assigned only below), raising a
                # NameError whenever loglevel >= 3.
                options.stdlog.write(
                    "# intervals in window %i:%i before transformation: %s\n"
                    % (start, end, str(intervals_with_gff)))

            intervals = transform(start, end, intervals_with_gff)

            for xstart, xend in intervals:
                n2 += 1
                l2 += xend - xstart

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# intervals in window %i:%i after transformation: %s\n"
                    % (start, end, str(intervals)))

            score, extra_info = decorator(intervals, start, end,
                                          contig, fasta)

        else:
            if len(values) > 0:
                values = list(map(float, values))
                score, extra_info = decorator(values, start, end, contig)
            else:
                score, extra_info = 0, None
            l2 = 0
            n2 = 0

        if is_gtf:
            ngenes, ntranscripts = len(genes), len(transcripts)
        else:
            ngenes, ntranscripts = 0, 0

        if extra_info:
            extra_info = re.sub("\t", ";", extra_info)

        options.stdout.write("\t".join(
            map(str, (contig, start, end,
                      ngenes, ntranscripts,
                      n1, l1, n2, l2,
                      score, extra_info))) + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped",
        type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions",
        type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour", dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches", dest="ignore_mismatches",
        action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.read_map(
            IOTools.open_file(options.filename_map), has_header=True)
        # invert the mapping: values become the lookup keys
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    # index the geneset: transcript_id (possibly translated through
    # id_map) -> sorted list of gtf entries
    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    # transcript not present in the map - skip it
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    # optional positional index of regions to remove
    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    # "wh" writes sam with header, "wb" writes bam - both to stdout
    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    # run the filter; c collects per-category counts
    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Concatenates the supplied bed files, merges overlapping entries,
    and for every merged interval counts in how many of the input
    samples it occurs.  Results are written to stdout as
    ``contig\\tstart\\tend\\tcount``.
    """
    # local import: cleanup of the temporary files below needs os
    import os

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("--bed-file", dest="infiles", type="string",
                      metavar="bed",
                      help="supply list of bed files",
                      action="append")

    parser.set_defaults(infiles=[])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    options.infiles.extend(args)
    if len(options.infiles) == 0:
        raise ValueError('please provide at least 1 bed file')

    E.info("concatenating bed files")
    # concatenate the list of files
    tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_merge = tempfile.NamedTemporaryFile(delete=False, mode="w")
    infs = options.infiles
    for inf in infs:
        for bed in Bed.iterator(IOTools.open_file(inf)):
            tmp.write("%s\n" % bed)
    tmp.close()

    E.info("merging bed entries")
    # merge the bed entries in the file
    name = tmp.name
    tmp_bed = pybedtools.BedTool(name)
    tmp_bed.sort().merge().saveas(tmp_merge.name)
    tmp_merge.close()

    E.info("indexing bed entries")
    # index the bed entries
    merged = IndexedGenome.Simple()
    for bed in Bed.iterator(IOTools.open_file(tmp_merge.name)):
        merged.add(bed.contig, bed.start, bed.end)

    counts = collections.defaultdict(int)
    # list of samples
    samples = options.infiles

    E.info("counting no. samples overlapping each interval")
    for sample in samples:
        # count each merged interval at most once per sample
        found = set()
        for bed in Bed.iterator(IOTools.open_file(sample)):
            if merged.contains(bed.contig, bed.start, bed.end):
                key = [bed.contig] + \
                    [x for x in merged.get(bed.contig, bed.start, bed.end)]
                key = (key[0], key[1][0], key[1][1])
                if key in found:
                    continue
                found.add(key)

                # tuple of interval description as key - (contig, start, end)
                counts[key] += 1

    # BUGFIX: remove the temporary files - they were created with
    # delete=False and previously leaked into the filesystem.
    os.unlink(tmp.name)
    os.unlink(tmp_merge.name)

    # open outfile
    options.stdout.write("contig\tstart\tend\tcount\n")

    E.info("outputting result")
    for interval, count in sorted(counts.items()):
        options.stdout.write(
            "\t".join(map(str, interval)) + "\t" + str(count) + "\n")

    # write footer and output benchmark information.
    E.stop()
def annotateWindows(contig, windows, gff_data, fasta, options):
    """annotate windows.

    Builds a positional index over *gff_data* and, for each
    ``(start, end)`` window in *windows*, collects the overlapping
    intervals, optionally transforms them (``options.transform``) and
    summarizes them with the decorator selected via
    ``options.decorator``.  One gtf-formatted line per window is
    written to ``options.stdout``.
    """
    index = IndexedGenome.IndexedGenome()
    for g in gff_data:
        index.add(g.contig, g.start, g.end, g)

    # template entry reused for all output lines
    w = GTF.Entry()
    w.contig = contig
    w.feature = "count"

    is_gtf = options.is_gtf

    # select the interval transformation
    if options.transform == "none":
        # BUGFIX: build a list instead of a single-use map iterator;
        # the iterator was exhausted by the counting loop before the
        # decorator could see the intervals.
        transform = lambda wstart, wend, intervals: [
            (x[0], x[1]) for x in intervals]
    elif options.transform == "overlap":
        transform = transform_overlap
    elif options.transform == "complement":
        transform = transform_complement
    elif options.transform == "third_codon":
        transform = transform_third_codon
    else:
        raise ValueError("unknown transform %s" % options.transform)

    # select the decorator; score-based decorators act on the raw
    # score values instead of on intervals
    work_on_intervals = True
    if options.decorator == "counts":
        decorator = decorator_counts
    elif options.decorator == "mean-length":
        decorator = decorator_mean_length
    elif options.decorator == "median-length":
        decorator = decorator_median_length
    elif options.decorator == "percent-coverage":
        decorator = decorator_percent_coverage
    elif options.decorator == "gc":
        decorator = decorator_percent_gc
    elif options.decorator == "median-score":
        decorator = decorator_median_score
        work_on_intervals = False
    elif options.decorator == "mean-score":
        decorator = decorator_mean_score
        work_on_intervals = False
    elif options.decorator == "stddev-score":
        decorator = decorator_stddev_score
        work_on_intervals = False
    elif options.decorator == "min-score":
        decorator = decorator_min_score
        work_on_intervals = False
    elif options.decorator == "max-score":
        decorator = decorator_max_score
        work_on_intervals = False
    else:
        raise ValueError("unknown decorator %s" % options.decorator)

    for start, end in windows:

        # counts/length before/after transformation
        n1, l1, n2, l2 = 0, 0, 0, 0
        values, intervals_with_gff, genes, transcripts = [], [], set(), set()

        try:
            for istart, iend, value in index.get(contig, start, end):
                n1 += 1
                l1 += iend - istart
                intervals_with_gff.append((istart, iend, value))
                values.append(value.score)
                if is_gtf:
                    genes.add(value.gene_id)
                    # BUGFIX: was ``value.mTransciptId`` - a misspelled
                    # old-style attribute; gene_id above uses the
                    # new-style accessors.
                    transcripts.add(value.transcript_id)
        except KeyError:
            pass

        if n1 == 0 and options.skip_empty:
            continue

        if work_on_intervals:

            if options.loglevel >= 3:
                # BUGFIX: this referenced the undefined name
                # ``intervals`` (assigned only below), raising a
                # NameError whenever loglevel >= 3.
                options.stdlog.write(
                    "# intervals in window %i:%i before transformation: %s\n"
                    % (start, end, str(intervals_with_gff)))

            intervals = transform(start, end, intervals_with_gff)

            for xstart, xend in intervals:
                n2 += 1
                l2 += xend - xstart

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# intervals in window %i:%i after transformation: %s\n"
                    % (start, end, str(intervals)))

            w.score, extra_info = decorator(intervals, start, end,
                                            contig, fasta)

        else:
            if len(values) > 0:
                # BUGFIX: materialize the map so the decorator receives
                # a list, not a single-use iterator (python3).
                values = list(map(float, values))
                w.score, extra_info = decorator(values, start, end, contig)
            else:
                w.score, extra_info = 0, None
            l2 = 0
            n2 = 0

        w.start = start
        w.end = end
        w.clearAttributes()
        w.addAttribute("n1", n1)
        w.addAttribute("l1", l1)
        w.addAttribute("n2", n2)
        w.addAttribute("l2", l2)
        if extra_info:
            w.addAttribute("extra", extra_info)

        options.stdout.write(str(w) + "\n")