def testIteratorUncompressed(self):
    '''test iteration from uncompressed file.'''
    tmpfilename = 'tmp_testIteratorUncompressed'
    infile = gzip.open(self.filename, "rb")
    outfile = open(tmpfilename, "wb")
    outfile.write(infile.read())
    outfile.close()
    infile.close()

    with open(tmpfilename) as infile:
        for x, r in enumerate(pysam.tabix_iterator(infile,
                                                   pysam.asTuple())):
            self.assertEqual(self.compare[x], list(r))
            self.assertEqual(len(self.compare[x]), len(r))

            # test indexing
            for c in range(0, len(r)):
                self.assertEqual(self.compare[x][c], r[c])

            # test slicing access
            for c in range(0, len(r) - 1):
                for cc in range(c + 1, len(r)):
                    self.assertEqual(self.compare[x][c:cc],
                                     r[c:cc])

    os.unlink(tmpfilename)
def filter_bam(args, bcd):
    with open(args.output, 'w') as o:
        with gzip.open(args.fragments) as f:
            tbx = pysam.tabix_iterator(f, pysam.asBed())
            for line in tbx:
                if line.name in bcd:
                    o.write("{}\n".format(str(line)))
    return 0
def filter_bam(args, bcd):
    reads = {}
    with gzip.open(args.fragments) as f:
        tbx = pysam.tabix_iterator(f, pysam.asBed())
        for line in tbx:
            if line.name in bcd:
                # the replicate index is the second-to-last "_"-separated
                # token of the fragment name
                key = bcd[line.name] + "_rep" + line.name.split("_")[-2]
                reads.setdefault(key, []).append(str(line))
    return reads
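A minimal usage sketch for the first `filter_bam` variant above, assuming the surrounding module already imports `gzip` and `pysam`; the file names, the barcode dictionary `bcd`, and the fragment-name layout (second-to-last "_"-separated token as replicate index, as the second variant expects) are all hypothetical:

import argparse

# hypothetical inputs: a bgzipped BED of fragments and a barcode map
args = argparse.Namespace(fragments="fragments.bed.gz",
                          output="filtered.bed")
bcd = {"AAACCTGA_1_001": "cell_42"}  # name -> cell label

filter_bam(args, bcd)  # writes fragments with known barcodes to filtered.bed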
def testIteratorCompressed(self):
    """test iteration from compressed file."""
    with gzip.open(self.filename) as infile:
        for x, r in enumerate(pysam.tabix_iterator(infile,
                                                   pysam.asTuple())):
            self.assertEqual(self.compare[x], list(r))
            self.assertEqual(len(self.compare[x]), len(r))

            # test indexing
            for c in range(0, len(r)):
                self.assertEqual(self.compare[x][c], r[c])

            # test slicing access
            for c in range(0, len(r) - 1):
                for cc in range(c + 1, len(r)):
                    self.assertEqual(self.compare[x][c:cc],
                                     r[c:cc])
def GetSumOfDifferencesFromTheReference(vcfpath):
    from subprocess import check_call
    from utilBMF.HTSUtils import TrimExt
    import pysam
    import numpy as np
    from sys import stderr
    from itertools import chain
    cfi = chain.from_iterable

    bgvcfpath = TrimExt(vcfpath) + ".gz"
    check_call("bgzip -c %s > %s" % (vcfpath, bgvcfpath), shell=True)
    stderr.write("bgvcf now at %s\n" % bgvcfpath)
    tabixstr = "tabix " + bgvcfpath
    stderr.write("Now calling tabixstr: '%s'\n" % tabixstr)
    check_call(tabixstr, shell=True)

    infh = open(bgvcfpath, "rb")
    tabixhandle = pysam.tabix_iterator(infh, pysam.asVCF())
    # pull the two alt-supporting depths out of each record's I16 INFO tag
    return np.sum(np.array(
        list(cfi([dict(tup.split("=") for tup in
                       i.info.split(";"))["I16"].split(",")[2:4]
                  for i in tabixhandle if "INDEL" not in i.info])),
        dtype=np.int64))
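For context: in samtools/bcftools-style VCFs, the first four comma-separated values of the I16 INFO tag are the forward/reverse read counts for the reference and alternate alleles, so the `[2:4]` slice above picks out the alt-supporting depths. A hypothetical invocation (the file name is an assumption; the VCF must carry I16 tags):

# sums alt-supporting read depths over all non-indel sites
total_alt_depth = GetSumOfDifferencesFromTheReference("sample.vcf")
print("alt-supporting read depth: %d" % total_alt_depth)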
def main():
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('--frac', type=float, default=0.0)
    parser.add_argument('gtf_file')
    args = parser.parse_args()

    all_t_ids = set()
    t_ids = set()
    for f in pysam.tabix_iterator(open(args.gtf_file), pysam.asGTF()):
        if f.feature == 'transcript':
            t_id = f.transcript_id
            frac = float(f.frac)
            keep = (frac >= args.frac)
            all_t_ids.add(t_id)
            if keep:
                t_ids.add(t_id)
                print str(f)
        elif f.feature == 'exon':
            t_id = f.transcript_id
            assert t_id in all_t_ids
            if t_id in t_ids:
                print str(f)
def iterator(infile):
    """return a simple iterator over all entries in a file."""
    return pysam.tabix_iterator(infile, pysam.asGTF())
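A minimal sketch of consuming such an iterator, assuming a gzipped GTF file exists under the hypothetical name `annotations.gtf.gz`; pysam's GTF proxy exposes fields such as `contig`, `feature`, `start` and `end`:

import gzip
import pysam

with gzip.open("annotations.gtf.gz") as infile:
    for gtf in iterator(infile):
        if gtf.feature == "exon":
            # print exon coordinates as contig<TAB>start<TAB>end
            print("%s\t%i\t%i" % (gtf.contig, gtf.start, gtf.end))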
def main(argv=sys.argv):

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-s", "--session", dest="session", type=str,
                        help="load session before creating plots ")

    parser.add_argument("-d", "--snapshot-dir", dest="snapshotdir",
                        type=str,
                        help="directory to save snapshots in ")

    parser.add_argument("-f", "--format", dest="format", type=str,
                        choices=("png", "eps", "svg"),
                        help="output file format ")

    parser.add_argument("-o", "--host", dest="host", type=str,
                        help="host that IGV is running on ")

    parser.add_argument("-p", "--port", dest="port", type=int,
                        help="port that IGV listens at ")

    parser.add_argument("-e", "--extend", dest="extend", type=int,
                        help="extend each interval by a number of bases ")

    parser.add_argument("-x", "--expand", dest="expand", type=float,
                        help="expand each region by a certain factor ")

    parser.add_argument("--session-only", dest="session_only",
                        action="store_true",
                        help="plot session after opening, "
                        "ignore intervals ")

    parser.add_argument("-n", "--name", dest="name", type=str,
                        choices=("bed-name", "increment"),
                        help="name to use for snapshot ")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
        name="bed-name",
        # referenced below but never set in the original snippet; a default
        # is added here so args.new_instance does not raise
        new_instance=False,
    )

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if args.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=args.command, port=args.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (args.host, args.port))
    E.info("saving images in %s" % args.snapshotdir)
    igv = IGV(host=args.host,
              port=args.port,
              snapshot_dir=os.path.abspath(args.snapshotdir))

    if args.session:
        E.info('loading session from %s' % args.session)
        igv.load(args.session)
        E.info('loaded session')

    if args.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(args.session), args.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(args.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(args.stdin,
                                        parser=pysam.asBed()):
            c.input += 1

            # IGV can not deal with white-space in filenames
            if args.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif args.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = args.extend
            if args.expand:
                d = end - start
                extend = max(extend, (args.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, args.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not args.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--reference-bed-file",
                      dest="reference_bed_file", type="string",
                      help="reference bed file "
                      "[%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("lvc-comparison", ),
                      help="methods to apply [%default]")

    parser.set_defaults(method="lvc-comparison",
                        reference_fasta_file=None,
                        input_bed_file=None,
                        size_bins=(1000, 10000, 100000),
                        output_sets=True,
                        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    reference_set = collections.defaultdict(quicksect.IntervalTree)

    E.info("reading reference bed file from {}".format(
        options.reference_bed_file))
    with IOTools.open_file(options.reference_bed_file) as inf:
        for record in pysam.tabix_iterator(inf, pysam.asBed()):
            mm = reference_set[record.contig]
            mm.add(record.start, record.end)
    E.info("read reference intervals on {} contigs: {}".format(
        len(list(reference_set.keys())),
        ",".join(list(reference_set.keys()))))

    if options.output_sets:
        output_tp = E.open_output_file("tp")
        output_fp = E.open_output_file("fp")
        output_fn = E.open_output_file("fn")
    else:
        output_tp = None
        output_fp = None
        output_fn = None

    if options.method == "lvc-comparison":
        c = E.Counter()
        found = set()
        counts = {}
        names = set()
        nsize_bins = len(options.size_bins)
        for bin in range(len(options.size_bins) + 1):
            counts[bin] = dict([(x, collections.defaultdict(int))
                                for x in ("tp", "fn", "fp", "test", "truth")])

        for record in pysam.tabix_iterator(options.stdin, pysam.asBed()):
            if record.contig not in reference_set:
                c.ignored_no_contig += 1
                continue

            c.test += 1
            matches = reference_set[record.contig].search(
                record.start, record.end)
            size = record.end - record.start
            bin = get_size_bin(size, options.size_bins)

            if len(matches) == 0:
                c.fp += 1
                status = "fp"
                if output_fp:
                    output_fp.write(str(record) + "\n")
            elif len(matches) >= 1:
                c.tp += 1
                status = "tp"
                if output_tp:
                    output_tp.write(str(record) + "\n")
                # todo: overlap criteria
                # record found
                for match in matches:
                    found.add((record.contig, match.start, match.end))

            name = record.name.split(",")[0]
            names.add(name)
            counts[bin]["test"][name] += 1
            counts[bin][status][name] += 1

        outf = options.stdout

        with IOTools.open_file(options.reference_bed_file) as inf:
            for record in pysam.tabix_iterator(inf, pysam.asBed()):
                c.truth += 1
                bin = get_size_bin(record.end - record.start,
                                   options.size_bins)
                counts[bin]["truth"]["all"] += 1

                key = (record.contig, record.start, record.end)
                if key not in found:
                    c.fn += 1
                    counts[bin]["fn"]["all"] += 1

        outf.write("\t".join(("category", "size", "test", "tp", "fp",
                              "truth", "fn")) + "\n")

        for name in sorted(names):
            for bin in range(len(options.size_bins) + 1):
                if bin == len(options.size_bins):
                    size_bin = ">={}".format(options.size_bins[-1])
                else:
                    size_bin = "<{}".format(options.size_bins[bin])
                outf.write("\t".join(
                    map(str, (
                        name,
                        size_bin,
                        counts[bin]["test"][name],
                        counts[bin]["tp"][name],
                        counts[bin]["fp"][name],
                        counts[bin]["truth"]["all"],
                        counts[bin]["fn"]["all"],
                    ))) + "\n")

        E.info(str(c))

    E.stop()
def _aggregate_gtf(gtf_file, sample_id, gtf_expr_attr, output_fh, stats_fh,
                   is_ref=False):
    def _init_t_dict():
        return {'_id': None, 'num_exons': 0, 'length': 0}

    t_dict = collections.defaultdict(_init_t_dict)
    cur_t_id = 1
    exprs = []
    for f in pysam.tabix_iterator(open(gtf_file), pysam.asGTF()):
        if f.feature == 'transcript':
            t_id = f.transcript_id
            if t_id in t_dict:
                m = 'GTF "%s" transcript_id "%s" not unique' % (gtf_file,
                                                                t_id)
                raise GTFError(m)
            t_item = t_dict[t_id]
            # rename transcript id
            new_t_id = "%s.T%d" % (sample_id, cur_t_id)
            cur_t_id += 1
            t_item['_id'] = new_t_id
            if is_ref:
                expr = 0.0
            else:
                expr = float(f[gtf_expr_attr])
            exprs.append(expr)
            # prepare attributes
            attrs = {GTF.Attr.TRANSCRIPT_ID: new_t_id,
                     GTF.Attr.SAMPLE_ID: sample_id,
                     GTF.Attr.REF: str(int(is_ref)),
                     GTF.Attr.EXPR: str(expr)}
            # save attributes
            f.fromDict(attrs)
            print >>output_fh, str(f)
        elif f.feature == 'exon':
            t_id = f.transcript_id
            t_item = t_dict[t_id]
            # update statistics
            t_item['num_exons'] += 1
            t_item['length'] += (f.end - f.start)
            # replace transcript id
            f.fromDict({GTF.Attr.TRANSCRIPT_ID: t_item['_id']})
            print >>output_fh, str(f)

    # process statistics
    num_exons = []
    lengths = []
    for t_item in t_dict.itervalues():
        lengths.append(t_item['length'])
        num_exons.append(t_item['num_exons'])

    # compute and write stats
    quantiles = range(0, 101)
    expr_qs = (scoreatpercentile(exprs, q) for q in quantiles)
    expr_qs = ','.join(map(str, expr_qs))
    length_qs = (int(round(scoreatpercentile(lengths, q)))
                 for q in quantiles)
    length_qs = ','.join(map(str, length_qs))
    num_exon_qs = (int(round(scoreatpercentile(num_exons, q)))
                   for q in quantiles)
    num_exon_qs = ','.join(map(str, num_exon_qs))
    fields = [sample_id, len(t_dict), expr_qs, length_qs, num_exon_qs]
    print >>stats_fh, '\t'.join(map(str, fields))
def iterate_parsed_uncompressed(fn):
    with open(fn) as f:
        return len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
def test_iterator_parsed_compressed():
    f = gzip.open(fn_compressed)
    l = len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
def test_iterator_parsed_uncompressed():
    f = open("windows_small.bed")
    l = len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
def readFromFile(infile):
    """read records from file and return as list."""
    result = []
    for gff in pysam.tabix_iterator(infile, pysam.asGTF()):
        result.append(gff)
    return result
def readFromFile(infile):
    """read gtf from file."""
    result = []
    for gff in pysam.tabix_iterator(infile, pysam.asGTF()):
        result.append(gff)
    return result
def main(argv=None):
    '''
    main function
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--is-gff3", dest="gff3_input",
                      action="store_true",
                      help="filename in gff3 format "
                      "[default=%default].")

    parser.add_option("-o", "--output-only-attributes",
                      dest="only_attributes", action="store_true",
                      help="output only attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-f", "--attributes-as-columns", dest="output_full",
                      action="store_true",
                      help="output attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="convert tab-separated table back to gtf "
                      "[default=%default].")

    parser.add_option("-m", "--output-map", dest="output_map", type="choice",
                      choices=("transcript2gene",
                               "peptide2gene",
                               "peptide2transcript"),
                      help="output a map mapping transcripts to genes "
                      "[default=%default].")

    parser.set_defaults(only_attributes=False,
                        output_full=False,
                        invert=False,
                        output_map=None,
                        gff3_input=False)

    (options, args) = E.Start(parser, argv=argv)

    if options.output_full:
        # output full table with column for each attribute

        # to specify gff3 format
        if options.gff3_input is True:
            gff = pysam.tabix_iterator(options.stdin,
                                       parser=pysam.asGFF3())
            attributes = set()
            data = []
            for line in gff:
                # get keys to write out to header
                data.append(line)
                attributes = attributes.union(set(line.keys()))

            attributes = sorted(list(attributes))

            header = ["contig", "source", "feature", "start", "end",
                      "score", "strand", "frame"] + attributes

            options.stdout.write("\t".join(header) + "\n")

            for gff3 in data:
                for a in header:
                    val = getattr(gff3, a)
                    options.stdout.write("%s\t" % (val))
                options.stdout.write("\n")

        else:
            attributes = set()
            data = []
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes = attributes.union(set(gtf.keys()))

            # remove gene_id and transcript_id, as they are used
            # explicitely later
            attributes.difference_update(["gene_id", "transcript_id"])
            attributes = sorted(list(attributes))

            if options.only_attributes:
                header = ["gene_id", "transcript_id"] + attributes
            else:
                header = ["contig", "source", "feature", "start", "end",
                          "score", "strand", "frame", "gene_id",
                          "transcript_id"] + attributes

            options.stdout.write("\t".join(header) + "\n")

            if options.only_attributes:
                for gtf in data:
                    options.stdout.write("\t".join(map(str, (
                        gtf.gene_id,
                        gtf.transcript_id,
                    ))))
                    for a in attributes:
                        if a in ("gene_id", "transcript_id"):
                            continue
                        try:
                            val = getattr(gtf, a)
                        except (AttributeError, KeyError):
                            val = ""
                        options.stdout.write("\t%s" % val)
                    options.stdout.write("\n")
            else:
                for gtf in data:
                    options.stdout.write("\t".join(map(str, (
                        gtf.contig, gtf.source, gtf.feature,
                        gtf.start, gtf.end, gtf.score,
                        gtf.strand, gtf.frame, gtf.gene_id,
                        gtf.transcript_id,
                    ))))
                    for a in attributes:
                        try:
                            val = getattr(gtf, a)
                        except AttributeError:
                            val = ""
                        options.stdout.write("\t%s" % val)
                    options.stdout.write("\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            data = line[:-1].split("\t")
            if not header:
                header = data
                map_header2column = dict(
                    [(y, x) for x, y in enumerate(header)])
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # start is already 0-based in the tab-separated output
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))

            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:

        if options.output_map == "transcript2gene":
            fr = lambda x: x.transcript_id
            to = lambda x: x.gene_id
            options.stdout.write("transcript_id\tgene_id\n")
        elif options.output_map == "peptide2gene":
            fr = lambda x: x.protein_id
            to = lambda x: x.gene_id
            options.stdout.write("peptide_id\tgene_id\n")
        elif options.output_map == "peptide2transcript":
            fr = lambda x: x.protein_id
            to = lambda x: x.transcript_id
            options.stdout.write("peptide_id\ttranscript_id\n")

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                map_fr2to[fr(gtf)] = to(gtf)
            except AttributeError:
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))

    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id", "transcript_id",
                  "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):
            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            attributes = "; ".join(attributes)

            options.stdout.write("\t".join(map(str, (
                gtf.contig, gtf.source, gtf.feature,
                gtf.start, gtf.end, GTF.toDot(gtf.score),
                gtf.strand, gtf.frame, gtf.gene_id,
                gtf.transcript_id, attributes,
            ))) + "\n")

    E.Stop()
def __init__(self, infile, *args, **kwargs):
    self.gff = pysam.tabix_iterator(iotools.open_file(infile),
                                    parser=pysam.asGFF3())
def get_bed_dict(refdict, bedfh):
    # np.bool was removed in NumPy 1.24; use the builtin bool dtype
    beddict = {chrom: np.zeros(len(refdict[chrom]), dtype=bool)
               for chrom in refdict.keys()}
    for bedline in pysam.tabix_iterator(bedfh, parser=pysam.asBed()):
        # BED end is 1 past the actual end, so this slice covers the
        # interval exactly
        beddict[bedline.contig][bedline.start:bedline.end] = True
    return beddict
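A minimal usage sketch for `get_bed_dict`, assuming hypothetical file names; `refdict` only needs to map each contig to something whose `len()` gives the contig length, which a FASTA read via pysam provides:

import gzip
import pysam

fasta = pysam.FastaFile("reference.fa")
refdict = {name: fasta.fetch(name) for name in fasta.references}

with gzip.open("regions.bed.gz") as bedfh:
    beddict = get_bed_dict(refdict, bedfh)

# fraction of the first contig covered by BED intervals
name = fasta.references[0]
print(beddict[name].mean())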
def __init__(self, infile, *args, **kwargs):
    self.gff = pysam.tabix_iterator(IOTools.openFile(infile),
                                    parser=pysam.asGFF3())