def testRead(self):
    """Parsed GFF3 records from tabix must agree with the expected rows."""
    gff3_parser = pysam.asGFF3()
    for idx, record in enumerate(self.tabix.fetch(parser=gff3_parser)):
        expected = self.compare[idx]
        # whole-record agreement: length, fields, and string form
        self.assertEqual(len(expected), len(record))
        self.assertEqual(list(expected), list(record))
        self.assertEqual(expected, str(record).split("\t"))
        self.assertEqual("\t".join(map(str, expected)), str(record))
        # named accessors: first column is the contig, ID attribute is exposed
        self.assertEqual(expected[0], record.contig)
        self.assertTrue(record.ID.startswith("MI00"))
def testSetting(self):
    """Mutating GFF3 record fields must be reflected in str(record)."""
    for record in self.tabix.fetch(parser=pysam.asGFF3()):
        # modify every settable field of the record
        record.contig = record.contig + "_test_contig"
        record.source = record.source + "_test_source"
        record.feature = record.feature + "_test_feature"
        record.start += 10
        record.end += 10
        record.score = 20
        record.strand = "+"
        record.frame = 0
        record.ID = "test"
        text = str(record)
        # each modification must show up in the serialized record
        for fragment in ("_test_contig", "_test_source",
                         "_test_feature", "ID=test"):
            self.assertTrue(fragment in text)
def main(argv=None):
    """Convert between GTF/GFF3 format and tab-separated tables.

    Modes, selected by mutually exclusive command-line options:

    ``--attributes-as-columns``
        full table with one column per attribute (GTF, or GFF3 with
        ``--is-gff3``)
    ``--invert``
        convert a tab-separated table back to gtf
    ``--output-map``
        two-column identifier map (transcript2gene, peptide2gene,
        peptide2transcript)
    default
        fixed gtf columns plus a single packed ``attributes`` column

    :param argv: command-line arguments; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--is-gff3", dest="gff3_input",
                      action="store_true",
                      help="filename in gff3 format"
                      "[default=%default].")

    parser.add_option("-o", "--output-only-attributes", dest="only_attributes",
                      action="store_true",
                      help="output only attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-f", "--attributes-as-columns", dest="output_full",
                      action="store_true",
                      help="output attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="convert tab-separated table back to gtf "
                      "[default=%default].")

    parser.add_option("-m", "--output-map", dest="output_map", type="choice",
                      choices=("transcript2gene",
                               "peptide2gene",
                               "peptide2transcript"),
                      help="output a map mapping transcripts to genes "
                      "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        gff3_input=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    # dispatch to one helper per output mode
    if options.output_full:
        _write_full_table(options)
    elif options.invert:
        _write_gtf_from_table(options)
    elif options.output_map:
        _write_id_map(options)
    else:
        _write_default_table(options)

    E.Stop()


def _write_full_table(options):
    """Write a full table with one separate column per attribute."""
    if options.gff3_input is True:
        # GFF3 input: records must be collected first so the union of
        # attribute keys can be emitted as a sorted header.
        gff = pysam.tabix_iterator(options.stdin,
                                   parser=pysam.asGFF3())
        attributes = set()
        data = []
        for line in gff:
            data.append(line)
            attributes = attributes.union(set(line.keys()))

        attributes = sorted(list(attributes))
        header = ["contig", "source", "feature", "start", "end",
                  "score", "strand", "frame"] + attributes
        options.stdout.write("\t".join(header) + "\n")

        for gff3 in data:
            # join the fields instead of writing "%s\t" per field: the
            # old form appended a trailing tab, giving every data row a
            # phantom empty column that the header does not have
            options.stdout.write("\t".join(
                str(getattr(gff3, a)) for a in header) + "\n")
    else:
        attributes = set()
        data = []
        for gtf in GTF.iterator(options.stdin):
            data.append(gtf)
            attributes = attributes.union(set(gtf.keys()))

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])
        attributes = sorted(list(attributes))

        if options.only_attributes:
            header = ["gene_id", "transcript_id"] + attributes
        else:
            header = ["contig", "source", "feature", "start", "end",
                      "score", "strand", "frame", "gene_id",
                      "transcript_id"] + attributes

        options.stdout.write("\t".join(header) + "\n")

        if options.only_attributes:
            for gtf in data:
                options.stdout.write("\t".join(map(str, (
                    gtf.gene_id,
                    gtf.transcript_id,
                ))))
                for a in attributes:
                    if a in ("gene_id", "transcript_id"):
                        continue  # already written as fixed columns
                    try:
                        val = getattr(gtf, a)
                    except (AttributeError, KeyError):
                        # attribute absent on this record: empty cell
                        val = ""
                    options.stdout.write("\t%s" % val)
                options.stdout.write("\n")
        else:
            for gtf in data:
                options.stdout.write("\t".join(map(str, (
                    gtf.contig,
                    gtf.source,
                    gtf.feature,
                    gtf.start,
                    gtf.end,
                    gtf.score,
                    gtf.strand,
                    gtf.frame,
                    gtf.gene_id,
                    gtf.transcript_id,
                ))))
                for a in attributes:
                    try:
                        val = getattr(gtf, a)
                    except (AttributeError, KeyError):
                        # mirror the only-attributes branch: a missing
                        # attribute yields an empty cell
                        val = ""
                    options.stdout.write("\t%s" % val)
                options.stdout.write("\n")


def _write_gtf_from_table(options):
    """Convert a tab-separated table (with header) back to gtf records."""
    gtf = GTF.Entry()
    header = None
    for line in options.stdin:
        if line.startswith("#"):
            continue
        data = line[:-1].split("\t")
        if not header:
            # first non-comment line is the header: map names to columns
            header = data
            map_header2column = dict([(y, x) for x, y in enumerate(header)])
            continue

        # fill gtf entry with data; coordinates are copied through
        # verbatim. NOTE(review): an earlier comment claimed a -1 shift
        # of start to 0-based coordinates, but no shift is applied --
        # confirm against GTF.Entry's coordinate convention.
        try:
            gtf.contig = data[map_header2column["contig"]]
            gtf.source = data[map_header2column["source"]]
            gtf.feature = data[map_header2column["feature"]]
            gtf.start = int(data[map_header2column["start"]])
            gtf.end = int(data[map_header2column["end"]])
            gtf.score = data[map_header2column["score"]]
            gtf.strand = data[map_header2column["strand"]]
            gtf.frame = data[map_header2column["frame"]]
            gtf.gene_id = data[map_header2column["gene_id"]]
            gtf.transcript_id = data[map_header2column["transcript_id"]]
            gtf.parseInfo(data[map_header2column["attributes"]], line)
        except KeyError as msg:
            raise KeyError("incomplete entry %s: %s: %s" %
                           (str(data), str(map_header2column), msg))
        # output gtf entry in gtf format
        options.stdout.write("%s\n" % str(gtf))


def _write_id_map(options):
    """Write a two-column mapping between feature identifiers."""
    # (source attribute, target attribute, header line) per mode; the
    # peptide modes read ``protein_id`` but label the column peptide_id
    config = {
        "transcript2gene":
            ("transcript_id", "gene_id", "transcript_id\tgene_id"),
        "peptide2gene":
            ("protein_id", "gene_id", "peptide_id\tgene_id"),
        "peptide2transcript":
            ("protein_id", "transcript_id", "peptide_id\ttranscript_id"),
    }
    fr_attr, to_attr, header = config[options.output_map]
    options.stdout.write(header + "\n")

    map_fr2to = {}
    for gtf in GTF.iterator(options.stdin):
        try:
            map_fr2to[getattr(gtf, fr_attr)] = getattr(gtf, to_attr)
        except AttributeError:
            # records missing either identifier are skipped silently
            pass

    for fr, to in sorted(map_fr2to.items()):
        options.stdout.write("%s\t%s\n" % (fr, to))


def _write_default_table(options):
    """Write fixed gtf columns plus one packed ``attributes`` column."""
    header = ("contig", "source", "feature", "start", "end", "score",
              "strand", "frame", "gene_id", "transcript_id", "attributes")
    options.stdout.write("\t".join(header) + "\n")

    for gtf in GTF.iterator(options.stdin):
        attributes = []
        for a in list(gtf.keys()):
            if a in ("gene_id", "transcript_id"):
                continue  # emitted as their own columns
            attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

        attributes = "; ".join(attributes)

        options.stdout.write("\t".join(map(str, (
            gtf.contig,
            gtf.source,
            gtf.feature,
            gtf.start,
            gtf.end,
            GTF.toDot(gtf.score),
            gtf.strand,
            gtf.frame,
            gtf.gene_id,
            gtf.transcript_id,
            attributes,
        ))) + "\n")
def open_gff3(filename: str) -> Iterator[Any]:
    """Yield parsed GFF3 records from *filename*, skipping comment lines."""
    gff3_parser = pysam.asGFF3()
    with open(filename, "rb") as stream:
        for raw in stream:
            if raw.startswith(b"#"):
                continue  # header/comment lines carry no records
            yield gff3_parser(raw, len(raw))
def __init__(self, infile, *args, **kwargs):
    """Open *infile* and expose it as an iterator of parsed GFF3 records."""
    handle = iotools.open_file(infile)
    gff3_parser = pysam.asGFF3()
    # tabix iterator yielding GFF3 records parsed from the open stream
    self.gff = pysam.tabix_iterator(handle, parser=gff3_parser)
"gff", help="positional input, must be a tabix gff file with tbi") parser.add_argument("outbam", help="positional output, must be bam") parser.add_argument("pkl", help="output pkl file") parser.add_argument("-n", "--np", help="minimum number of passes to mark", type=int, default=5) parser.add_argument('-d', help="store args.d as true if -d", action="store_true", default=False) args = parser.parse_args() bam = pysam.AlignmentFile(args.ccs, check_sq=False) gff = pysam.TabixFile(args.gff, parser=pysam.asGFF3()) obam = pysam.AlignmentFile(args.outbam, "wb", template=bam) contigs = set(gff.contigs) tmp_d = { "name": [], "np": [], "length": [], "positions": [], "qualities": [] } for idx, aln in enumerate(bam.fetch(until_eof=True)): if ((aln.query_name in contigs) and (aln.get_tag("np") >= args.np)): new_tags = get_mod_tags(gff, aln.query_name, aln) if (new_tags is not None): aln.set_tag("MM", new_tags[0])
def __init__(self, infile, *args, **kwargs):
    """Open *infile* and expose it as an iterator of parsed GFF3 records."""
    stream = IOTools.openFile(infile)
    # tabix iterator yielding GFF3 records parsed from the open stream
    self.gff = pysam.tabix_iterator(stream, parser=pysam.asGFF3())