def fetchProbeFragments(probe_bed, digest_bed, outfile, lookup_out):
    '''Assign each probe to the single digestion fragment it overlaps.

    For every entry in `probe_bed` the tabix-indexed file `digest_bed`
    is queried for fragments overlapping the probe interval. If exactly
    one fragment is found, a bed line with the fragment's coordinates,
    the probe's name, score "." and strand "+" is written to `outfile`,
    and a "probe<TAB>fragment" name pair is written to `lookup_out`
    (which starts with a header line). Probes overlapping zero or more
    than one fragment are skipped with a warning.

    Arguments
    ---------
    probe_bed : string
        Filename of the bed file with the probes.
    digest_bed : string
        Filename of the tabix-indexed bed file with the fragments.
    outfile : string
        Filename of the bed file to write.
    lookup_out : string
        Filename of the probe to fragment lookup table to write.
    '''

    digest_fragments = pysam.TabixFile(digest_bed)

    # A single Bed object is re-used for all output lines; every field
    # that is written out is overwritten for each accepted probe.
    bed = Bed.Bed()

    try:
        # The probe file is opened within the with statement so that
        # its handle is released, too, once all probes are processed.
        with IOTools.openFile(outfile, "w") as outf, \
                IOTools.openFile(lookup_out, "w") as lookup, \
                IOTools.openFile(probe_bed) as probes:

            lookup.write("probe\tfragment\n")

            for probe in Bed.iterator(probes):

                frag = list(digest_fragments.fetch(
                    probe.contig, probe.start, probe.end,
                    parser=pysam.asBed()))

                if len(frag) != 1:
                    E.warn("%i fragments found for probe %s, skipping" %
                           (len(frag), probe.name))
                    continue

                frag = frag[0]
                bed.start = frag.start
                bed.end = frag.end
                bed.contig = frag.contig
                bed["name"] = probe.name
                bed["score"] = "."
                bed["strand"] = "+"

                lookup.write("%s\t%s\n" % (probe.name, frag.name))
                outf.write(str(bed) + "\n")
    finally:
        # release the tabix file even if processing failed
        digest_fragments.close()
def bedsFromList(data):
    '''Generate one Bed entry for each record in `data`.

    Each record is an indexable sequence: item 0 is used as contig,
    items 1 and 2 are converted to integers and used as start and end,
    and all remaining items become the optional fields of the entry.

    This is a generator, entries are built lazily and errors are only
    raised once the offending record is reached.

    Raises
    ------
    ValueError
        If a record has fewer than three items or if items 1 or 2
        can not be converted to an integer.
    '''

    for record in data:
        entry = Bed.Bed()

        try:
            contig = record[0]
            start = int(record[1])
            end = int(record[2])
            entry.contig, entry.start, entry.end = contig, start, end
        except IndexError:
            raise ValueError("Insufficient fields to generate bed entry")
        except ValueError:
            raise ValueError("Fields 2 and 3 must be integer")

        entry.fields = record[3:]
        yield entry
def sites2fragments(infile, genomefile, outfile):
    '''Convert a bed file of digestion sites into a bed file of fragments.

    A fragment is written for every gap between consecutive sites on
    a contig: from position 0 to the start of the first site, from the
    end of each site to the start of the next one, and from the end of
    the last site to the end of the contig. Fragments are named with
    consecutive integers starting at 1 and get score "." and strand
    "+". The output is indexed with pysam.tabix_index.

    A contig is closed whenever the contig of the next site differs
    from the current one, hence all sites of a contig are expected to
    be adjacent in `infile`. If `infile` contains no sites, no fragment
    is written.

    Arguments
    ---------
    infile : string
        Filename of the bed file with the digestion sites.
    genomefile : string
        Filename of a whitespace separated table with the contig name
        in the first and the contig length in the second column.
    outfile : string
        Filename of the bed file to write.
    '''

    # NOTE(review): the last character of the length column is dropped
    # before conversion. As split() already removes the newline this
    # discards the final character of the field itself - kept as is,
    # confirm against the format of the genome file.
    contig_lengths = {}
    with IOTools.openFile(genomefile) as genome:
        for line in genome:
            fields = line.split()
            contig_lengths[fields[0]] = int(fields[1][:-1])

    last_end = 0
    last_contig = None
    name = 0

    # a single Bed object is re-used for all output lines
    new_bed = Bed.Bed()
    new_bed["strand"] = "+"
    new_bed["score"] = "."

    def write_fragment(outf, contig, start, end, name):
        # fill the shared Bed object and write it as one line
        new_bed.contig = contig
        new_bed.start = start
        new_bed.end = end
        new_bed["name"] = str(name)
        outf.write(str(new_bed) + "\n")

    with IOTools.openFile(outfile, "w") as outf, \
            IOTools.openFile(infile) as sites:

        for bed in Bed.iterator(sites):

            if last_contig is not None and bed.contig != last_contig:
                # Close the previous contig. The fragment ends at the
                # length of the *previous* contig, not at the length
                # of the contig of the site just read.
                name += 1
                write_fragment(outf, last_contig, last_end,
                               contig_lengths[last_contig], name)
                last_end = 0

            # fragment from the previous site (or contig start) up to
            # the current site
            name += 1
            write_fragment(outf, bed.contig, last_end, bed.start, name)

            last_contig = bed.contig
            last_end = bed.end

        # Close the final contig. Without any sites there is nothing
        # to close.
        if last_contig is not None:
            name += 1
            write_fragment(outf, last_contig, last_end,
                           contig_lengths[last_contig], name)

    pysam.tabix_index(outfile, force=True, preset="bed")
def transcript2bed12(transcript):
    '''Build a single bed12 entry from the GTF entries of a transcript.

    `transcript` is an indexable collection of GTF entries. The bed
    interval spans from the smallest start to the largest end of all
    entries and the blocks are the ranges that GTF.asRanges returns
    for the "exon" feature. The thick interval spans the "CDS"
    entries; without any CDS entry it is an empty interval at the
    transcript end for "-" strand transcripts and at the transcript
    start otherwise. Contig, strand and name (the transcript_id) are
    taken from the first entry. Score and itemRGB are not set
    explicitly.
    '''

    bed12 = Bed.Bed()

    tx_start = min(gtf.start for gtf in transcript)
    tx_end = max(gtf.end for gtf in transcript)

    coding = [gtf for gtf in transcript if gtf.feature == "CDS"]
    if coding:
        thick_start = min(gtf.start for gtf in coding)
        thick_end = max(gtf.end for gtf in coding)
    elif transcript[0].strand == "-":
        # no CDS: zero-length thick interval at the first base of the
        # transcript, which is its end on the reverse strand
        thick_start = thick_end = tx_end
    else:
        thick_start = thick_end = tx_start

    exon_ranges = GTF.asRanges(transcript, "exon")

    # block starts are relative to the start of the bed interval
    block_starts = []
    block_sizes = []
    for exon_start, exon_end in exon_ranges:
        block_starts.append(exon_start - tx_start)
        block_sizes.append(exon_end - exon_start)

    first = transcript[0]
    bed12.contig = first.contig
    bed12.start = tx_start
    bed12.end = tx_end
    bed12["strand"] = first.strand
    bed12["name"] = first.transcript_id
    bed12["thickStart"] = thick_start
    bed12["thickEnd"] = thick_end
    bed12["blockCount"] = len(exon_ranges)
    bed12["blockStarts"] = ",".join(str(offset) for offset in block_starts)
    bed12["blockSizes"] = ",".join(str(size) for size in block_sizes)

    return bed12
def windows2bed12(windows, contig, strand, name, score):
    '''Merge a list of intervals into a single bed12 entry.

    The intervals are sorted; the entry starts at the start of the
    first and ends at the end of the last sorted interval, and every
    interval becomes one block. The thick interval covers the whole
    entry and itemRGB is fixed to "255,0,0".

    Raises an AssertionError if the entry is empty, if a block has a
    size of zero or less, or if a block starts after the end of the
    entry.
    '''

    # NOTE: the intervals are sorted in the same way for both strands,
    # `strand` is only copied into the entry.
    ordered = sorted(windows)
    entry = Bed.Bed()

    origin = ordered[0][0]
    entry.start = int(origin)
    entry.end = int(ordered[-1][1])
    entry.contig = contig

    # block starts are relative to the start of the first interval
    sizes = [int(window[1] - window[0]) for window in ordered]
    offsets = [int(window[0] - origin) for window in ordered]

    blockSizes = ",".join(map(str, sizes))
    blockStarts = ",".join(map(str, offsets))

    entry.fields = [
        name,
        score,
        strand,
        int(entry.start),
        int(entry.end),
        "255,0,0",
        len(ordered),
        blockSizes,
        blockStarts,
    ]

    assert entry.end - entry.start > 0, \
        "Malformed Bed entry entry size less than zero"
    assert all(size > 0 for size in sizes), \
        "Malformed Bed entry, at least one block size less than zero"
    assert all(entry.start + offset <= entry.end for offset in offsets), \
        "Malformed Bed entry: block start after end of entry"

    return entry
def getExonLocations(filename):
    '''Return the exon locations of a list of genes as Bed entries.

    `filename` contains one ensembl gene ID per line; blank lines and
    repeated IDs are ignored. For these genes contig, start and end
    are selected from the table geneset_cds_gtf of the database given
    by PARAMS["annotations_database"], and one Bed entry is built per
    row. Without any gene ID the database is not queried and an empty
    list is returned.

    Arguments
    ---------
    filename : string
        Filename of the file with the gene IDs.

    Returns
    -------
    list
        Bed entries with contig, start and end set.
    '''

    # Upper bound of bound parameters per statement; 999 is the
    # default limit of older SQLite versions.
    batch_size = 999

    # collect unique, non-empty IDs in file order
    ensembl_ids = []
    seen = set()
    with IOTools.openFile(filename, "r") as fh:
        for line in fh:
            gene_id = line.strip()
            if gene_id and gene_id not in seen:
                seen.add(gene_id)
                ensembl_ids.append(gene_id)

    n_ids = len(ensembl_ids)
    region_list = []

    if ensembl_ids:
        dbhandle = sqlite3.connect(PARAMS["annotations_database"])
        try:
            cc = dbhandle.cursor()
            try:
                for offset in range(0, n_ids, batch_size):
                    batch = ensembl_ids[offset:offset + batch_size]
                    # The IDs are passed as bound parameters instead
                    # of being pasted into the statement, so their
                    # content can not alter the query.
                    statement = (
                        "select contig,start,end from geneset_cds_gtf "
                        "where gene_id in (%s)" %
                        ",".join("?" * len(batch)))
                    cc.execute(statement, batch)
                    for contig, start, end in cc:
                        b = Bed.Bed()
                        b.contig, b.start, b.end = contig, start, end
                        region_list.append(b)
            finally:
                cc.close()
        finally:
            dbhandle.close()

    E.info("Retrieved exon locations for %i genes. Got %i regions" %
           (n_ids, len(region_list)))

    return region_list
def main(argv=sys.argv):
    '''Script entry point: convert GFF/GTF entries on stdin to BED.

    Entries are read with GTF.iterator from options.stdin, converted
    with Bed.fromGTF using the field selected by --set-name as name,
    and written to options.stdout. With --track the complete input is
    loaded, sorted by the feature or source field, and each group is
    preceded by a "track name=..." line. The number of entries read
    and written is logged at the end.
    '''

    # NOTE(review): `argv` is accepted but not handed to E.Start -
    # presumably E.Start falls back to sys.argv; confirm.
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "--is-gtf", dest="is_gtf", action="store_true",
        help="input file is in gtf format [default=%default] ")

    parser.add_option(
        "--set-name", dest="name", type="choice",
        help="field from the GFF/GTF file to use as the "
        "name field in the BED file [%default]",
        choices=("gene_id", "transcript_id", "class", "family",
                 "feature", "source", "repName", "gene_biotype"))

    parser.add_option(
        "--track", dest="track", type="choice",
        choices=("feature", "source", None),
        help="use feature/source field to define BED tracks "
        "[default=%default]")

    parser.set_defaults(
        track=None,
        name="gene_id",
        is_gtf=False)

    options, args = E.Start(parser, add_pipe_options=True)

    ninput = 0
    noutput = 0

    records = GTF.iterator(options.stdin)

    if not options.track:
        # no tracks requested: convert entry by entry while streaming
        bed = Bed.Bed()
        for gff in records:
            ninput += 1
            bed.fromGTF(gff, name=options.name)
            options.stdout.write(str(bed) + "\n")
            noutput += 1
    else:
        # grouping into tracks requires all entries to be in memory
        all_input = list(records)

        if options.track == "feature":
            def track_key(entry):
                return entry.feature
        elif options.track == "source":
            def track_key(entry):
                return entry.source

        # groupby only merges adjacent entries, hence sort first
        all_input.sort(key=track_key)

        bed = Bed.Bed()
        for track_name, members in itertools.groupby(all_input, track_key):
            options.stdout.write("track name=%s\n" % track_name)
            for gff in members:
                ninput += 1
                bed.fromGTF(gff, name=options.name)
                options.stdout.write(str(bed) + "\n")
                noutput += 1

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))

    E.Stop()