示例#1
0
 def testRead(self):
     """Check every fetched GFF3 record against its expected row.

     Each record yielded by ``self.tabix.fetch`` is compared, by position,
     with the matching entry of ``self.compare``: same length, same fields,
     same tab-joined rendering, and an ``ID`` attribute starting with
     ``"MI00"``.
     """
     records = self.tabix.fetch(parser=pysam.asGFF3())
     for index, record in enumerate(records):
         expected = self.compare[index]
         rendered = str(record)

         # Field-by-field agreement with the expected row.
         self.assertEqual(len(expected), len(record))
         self.assertEqual(list(expected), list(record))
         self.assertEqual(expected, rendered.split("\t"))
         self.assertEqual(expected[0], record.contig)

         # The record must render back to the tab-joined expected fields.
         self.assertEqual("\t".join(str(field) for field in expected),
                          str(record))
         self.assertTrue(record.ID.startswith("MI00"))
示例#2
0
    def testSetting(self):
        """Check that modified GFF3 fields show up in the record's string form.

        Every record fetched from ``self.tabix`` has its fields and its ``ID``
        attribute overwritten; the rendered record must then contain the
        markers that were written.
        """
        expected_markers = (
            "_test_contig",
            "_test_source",
            "_test_feature",
            "ID=test",
        )

        for record in self.tabix.fetch(parser=pysam.asGFF3()):
            # Tag the textual fields with recognisable suffixes.
            record.contig += "_test_contig"
            record.source += "_test_source"
            record.feature += "_test_feature"

            # Shift the interval and overwrite the remaining fields.
            record.start = record.start + 10
            record.end = record.end + 10
            record.score = 20
            record.strand = "+"
            record.frame = 0
            record.ID = "test"

            rendered = str(record)
            for marker in expected_markers:
                self.assertTrue(marker in rendered)
示例#3
0
# Columns shared by every tabular output mode, in output order.
_FIXED_COLUMNS = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame")

# Identifier attributes that get their own dedicated columns for GTF input.
_ID_COLUMNS = ("gene_id", "transcript_id")

# --output-map choice -> (key attribute, value attribute, header line).
_MAP_COLUMNS = {
    "transcript2gene": ("transcript_id", "gene_id",
                        "transcript_id\tgene_id\n"),
    "peptide2gene": ("protein_id", "gene_id",
                     "peptide_id\tgene_id\n"),
    "peptide2transcript": ("protein_id", "transcript_id",
                           "peptide_id\ttranscript_id\n"),
}


def _collect_records(records):
    """Read all *records* into a list and gather the union of their keys.

    Returns a ``(data, keys)`` pair where ``data`` is the list of records
    and ``keys`` is the set of everything returned by ``record.keys()``.
    """
    data = []
    keys = set()
    for record in records:
        data.append(record)
        keys.update(record.keys())
    return data, keys


def _attribute_or_blank(record, name):
    """Return ``getattr(record, name)``, or ``""`` if the record lacks it."""
    try:
        return getattr(record, name)
    except (AttributeError, KeyError):
        # Not every record carries every attribute seen in the input;
        # missing ones become empty cells.
        return ""


def _write_table(outfile, data, fixed, attributes):
    """Write a header line and one tab-separated row per record.

    ``fixed`` columns are read with a plain ``getattr``; ``attributes``
    columns are optional and are left empty when a record lacks them.
    Rows are joined with tabs, so they have exactly as many columns as the
    header (no trailing separator).
    """
    columns = list(fixed) + list(attributes)
    outfile.write("\t".join(columns) + "\n")
    for record in data:
        row = [getattr(record, name) for name in fixed]
        row.extend(_attribute_or_blank(record, name) for name in attributes)
        outfile.write("\t".join(map(str, row)) + "\n")


def _write_full_table(options):
    """Output a table with one column per attribute found in the input."""
    if options.gff3_input:
        records = pysam.tabix_iterator(options.stdin, parser=pysam.asGFF3())
        data, keys = _collect_records(records)
        _write_table(options.stdout, data, _FIXED_COLUMNS, sorted(keys))
        return

    data, keys = _collect_records(GTF.iterator(options.stdin))

    # gene_id and transcript_id are emitted as dedicated columns, so they
    # must not be repeated among the generic attribute columns.
    keys.difference_update(_ID_COLUMNS)

    if options.only_attributes:
        fixed = _ID_COLUMNS
    else:
        fixed = _FIXED_COLUMNS + _ID_COLUMNS
    _write_table(options.stdout, data, fixed, sorted(keys))


def _write_inverted(options):
    """Convert a tab-separated table (with header) back to gtf lines."""
    gtf = GTF.Entry()
    map_header2column = None
    for line in options.stdin:
        if line.startswith("#"):
            continue
        # Strip only the line terminator: slicing off the last character
        # would eat data when the final line has no trailing newline.
        data = line.rstrip("\n").split("\t")
        if map_header2column is None:
            # The first non-comment line names the columns.
            map_header2column = {
                name: index for index, name in enumerate(data)}
            continue

        # fill gtf entry with data
        try:
            gtf.contig = data[map_header2column["contig"]]
            gtf.source = data[map_header2column["source"]]
            gtf.feature = data[map_header2column["feature"]]
            # start and end are taken verbatim from the table; no
            # coordinate shift is applied here.
            gtf.start = int(data[map_header2column["start"]])
            gtf.end = int(data[map_header2column["end"]])
            gtf.score = data[map_header2column["score"]]
            gtf.strand = data[map_header2column["strand"]]
            gtf.frame = data[map_header2column["frame"]]
            gtf.gene_id = data[map_header2column["gene_id"]]
            gtf.transcript_id = data[map_header2column["transcript_id"]]
            gtf.parseInfo(data[map_header2column["attributes"]], line)
        except KeyError as msg:
            raise KeyError("incomplete entry %s: %s: %s" %
                           (str(data), str(map_header2column), msg))
        # output gtf entry in gtf format
        options.stdout.write("%s\n" % str(gtf))


def _write_id_map(options):
    """Output a sorted two-column map between identifier attributes."""
    key_name, value_name, header = _MAP_COLUMNS[options.output_map]
    options.stdout.write(header)

    mapping = {}
    for gtf in GTF.iterator(options.stdin):
        try:
            mapping[getattr(gtf, key_name)] = getattr(gtf, value_name)
        except (AttributeError, KeyError):
            # Records without the requested identifiers are skipped.
            continue

    for key, value in sorted(mapping.items()):
        options.stdout.write("%s\t%s\n" % (key, value))


def _write_flat_table(options):
    """Output the fixed columns plus a single combined attributes column."""
    header = _FIXED_COLUMNS + _ID_COLUMNS + ("attributes",)
    options.stdout.write("\t".join(header) + "\n")

    for gtf in GTF.iterator(options.stdin):
        # gene_id/transcript_id already have their own columns.
        attributes = "; ".join(
            '%s %s' % (key, GTF.quote(gtf[key]))
            for key in gtf.keys()
            if key not in _ID_COLUMNS)

        options.stdout.write("\t".join(
            map(str, (
                gtf.contig,
                gtf.source,
                gtf.feature,
                gtf.start,
                gtf.end,
                GTF.toDot(gtf.score),
                gtf.strand,
                gtf.frame,
                gtf.gene_id,
                gtf.transcript_id,
                attributes,
            ))) + "\n")


def main(argv=None):
    '''Convert between gtf/gff3 and tab-separated tables.

    Reads from ``options.stdin`` and writes to ``options.stdout`` as set up
    by ``E.Start``. The first matching mode below is used:

    * ``--attributes-as-columns``: one column per attribute (gtf input, or
      gff3 input with ``--is-gff3``); ``--output-only-attributes`` limits
      the gtf output to the identifier and attribute columns.
    * ``--invert``: turn a tab-separated table back into gtf.
    * ``--output-map``: write a two-column identifier map.
    * otherwise: fixed columns plus one combined ``attributes`` column.

    argv -- command line arguments; defaults to ``sys.argv``.
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--is-gff3",
                      dest="gff3_input",
                      action="store_true",
                      help="filename in gff3 format "
                      "[default=%default].")

    parser.add_option("-o",
                      "--output-only-attributes",
                      dest="only_attributes",
                      action="store_true",
                      help="output only attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-f",
                      "--attributes-as-columns",
                      dest="output_full",
                      action="store_true",
                      help="output attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="convert tab-separated table back to gtf "
                      "[default=%default].")

    parser.add_option("-m",
                      "--output-map",
                      dest="output_map",
                      type="choice",
                      choices=("transcript2gene", "peptide2gene",
                               "peptide2transcript"),
                      help="output a map mapping transcripts to genes "
                      "[default=%default].")

    parser.set_defaults(only_attributes=False,
                        output_full=False,
                        invert=False,
                        output_map=None,
                        gff3_input=False)

    (options, args) = E.Start(parser, argv=argv)

    if options.output_full:
        _write_full_table(options)
    elif options.invert:
        _write_inverted(options)
    elif options.output_map:
        _write_id_map(options)
    else:
        _write_flat_table(options)

    E.Stop()
示例#4
0
def open_gff3(filename: str) -> Iterator[Any]:
    """Lazily yield one parsed record per non-comment line of *filename*.

    The file is read in binary mode; lines starting with ``#`` are skipped
    and every other line is handed to a ``pysam.asGFF3`` parser together
    with its length in bytes.
    """
    to_record = pysam.asGFF3()
    with open(filename, "rb") as stream:
        for raw in stream:
            if raw.startswith(b"#"):
                continue
            yield to_record(raw, len(raw))
示例#5
0
 def __init__(self, infile, *args, **kwargs):
     """Wrap *infile* in a tabix iterator that parses lines as GFF3.

     The iterator is stored on ``self.gff``. Extra positional and keyword
     arguments are accepted but not used here.
     """
     handle = iotools.open_file(infile)
     self.gff = pysam.tabix_iterator(handle, parser=pysam.asGFF3())
示例#6
0
        "gff", help="positional input, must be a tabix gff file with tbi")
    parser.add_argument("outbam", help="positional output, must be bam")
    parser.add_argument("pkl", help="output pkl file")
    parser.add_argument("-n",
                        "--np",
                        help="minimum number of passes to mark",
                        type=int,
                        default=5)
    parser.add_argument('-d',
                        help="store args.d as true if -d",
                        action="store_true",
                        default=False)
    args = parser.parse_args()

    bam = pysam.AlignmentFile(args.ccs, check_sq=False)
    gff = pysam.TabixFile(args.gff, parser=pysam.asGFF3())
    obam = pysam.AlignmentFile(args.outbam, "wb", template=bam)

    contigs = set(gff.contigs)
    tmp_d = {
        "name": [],
        "np": [],
        "length": [],
        "positions": [],
        "qualities": []
    }
    for idx, aln in enumerate(bam.fetch(until_eof=True)):
        if ((aln.query_name in contigs) and (aln.get_tag("np") >= args.np)):
            new_tags = get_mod_tags(gff, aln.query_name, aln)
            if (new_tags is not None):
                aln.set_tag("MM", new_tags[0])
 def __init__(self, infile, *args, **kwargs):
     """Wrap *infile* in a tabix iterator that parses lines as GFF3.

     The iterator is stored on ``self.gff``. Extra positional and keyword
     arguments are accepted but not used here.
     """
     handle = IOTools.openFile(infile)
     self.gff = pysam.tabix_iterator(handle, parser=pysam.asGFF3())