예제 #1
0
    def _mainLoop(self):
        """Copy the alignment summary GFF, adding modification counts to regions.

        Modification calls (records whose type is ``modified_base``) are read
        from ``self.args.modifications``. ``self.args.alignmentSummary`` is
        then streamed into ``self.args.outfile``:

        * lines starting with ``#`` are written through with surrounding
          whitespace stripped; ``sequence-header`` lines additionally
          populate ``self.seqMap`` (internal tag -> external tag);
        * the tool-specific headers are written once, immediately before the
          first non-header line;
        * every ``region`` record gets ``modsfwd`` / ``modsrev`` set to the
          number of calls on the ``+`` / ``-`` strand located inside the
          region on the same sequence; records of other types are not
          written.

        Blank lines in the summary file are skipped.
        """
        # Set up some additional headers to be injected
        headers = [
            ('source', 'kineticModificationCaller 1.3.1'),
            ('source-commandline', " ".join(sys.argv)),
            ('attribute-description', 'modsfwd - count of detected DNA modifications on forward strand'),
            ('attribute-description', 'modsrev - count of detected DNA modifications on reverse strand')
        ]

        # Read the existing modifications.gff and index the calls by sequence
        # id, so a region only ever sees calls made on its own sequence.
        hitsBySeqid = {}
        modReader = GffReader(self.args.modifications)
        try:
            for x in modReader:
                if x.type == 'modified_base':
                    hitsBySeqid.setdefault(x.seqid, []).append((x.start, x.strand))
        finally:
            modReader.close()

        self.seqMap = {}
        inHeader = True

        # open() + with: works on Python 2 and 3 and guarantees that both
        # handles are flushed and closed, even if a record fails to parse.
        with open(self.args.alignmentSummary) as summaryFile, \
                open(self.args.outfile, "w") as summaryWriter:
            for line in summaryFile:
                # Nothing to pass through or parse on an empty line
                if not line.strip():
                    continue

                # Pass any metadata line straight through
                if line[0] == "#":
                    # Parse headers
                    splitFields = line.replace('#', '').split(' ')
                    field = splitFields[0]
                    value = " ".join(splitFields[1:])
                    if field == 'sequence-header':
                        internalTag, _, externalTag = value.strip().partition(' ')
                        self.seqMap[internalTag] = externalTag
                    summaryWriter.write(line.strip() + "\n")
                    continue

                if inHeader:
                    # We are at the end of the header -- write the tool-specific headers
                    for header in headers:
                        summaryWriter.write("##%s %s\n" % header)
                    inHeader = False

                # Parse the line
                rec = Gff3Record.fromString(line)
                if rec.type != 'region':
                    continue

                # Count the calls inside this interval, per strand
                fwd = 0
                rev = 0
                for pos, strand in hitsBySeqid.get(rec.seqid, ()):
                    if rec.start <= pos <= rec.end:
                        if strand == '+':
                            fwd += 1
                        elif strand == '-':
                            rev += 1

                rec.modsfwd = fwd
                rec.modsrev = rev

                summaryWriter.write(str(rec) + "\n")
예제 #2
0
 def test_fromString(self):
     """A record rendered with str() and re-parsed must render identically."""
     serialized = str(self.RECORD)
     roundTripped = Gff3Record.fromString(serialized)
     assert serialized == str(roundTripped)
예제 #3
0
 def test_fromString(self):
     """Check that fromString() inverts str() for the fixture record."""
     expected = str(self.record)
     reparsed = Gff3Record.fromString(expected)
     assert_equal(expected, str(reparsed))
def main():
    """Augment an alignment_summary.gff with consensus and variant counts.

    Command line: ``--variantsGff`` (required), ``--output`` / ``-o`` and the
    positional input alignment summary GFF.

    The input summary is read twice. The first pass creates one counter set
    per record, keyed by ``Region(seqid, start, end)``; the variants GFF is
    then folded into those counters. The second pass copies the summary to
    the output file: ``#`` lines pass through, the tool-specific headers are
    written once right before the first non-header line, and every
    ``region`` record is written with ``cQv``, ``ins``, ``del`` and ``sub``
    attributes appended. Records of other types are not written, and blank
    lines are skipped.
    """
    headers = [
        ("source", "GenomicConsensus %s" % __VERSION__),
        ("pacbio-alignment-summary-version", "0.6"),
        ("source-commandline", " ".join(sys.argv)),
    ]

    desc = "Augment the alignment_summary.gff file with consensus and variants information."
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--variantsGff",
                        type=str,
                        help="Input variants.gff or variants.gff.gz filename",
                        required=True)
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        help="Output alignment_summary.gff filename")
    parser.add_argument("inputAlignmentSummaryGff",
                        type=str,
                        help="Input alignment_summary.gff filename")

    options = parser.parse_args()

    counterNames = {
        "insertion": "ins",
        "deletion": "del",
        "substitution": "sub"
    }

    summaries = {}
    inputVariantsGff = GffReader(options.variantsGff)
    try:
        # Pass 1: one zeroed summary per record of the alignment summary
        regionReader = GffReader(options.inputAlignmentSummaryGff)
        try:
            for gffRecord in regionReader:
                region = Region(gffRecord.seqid, gffRecord.start, gffRecord.end)
                summaries[region] = {"ins": 0, "del": 0, "sub": 0, "cQv": (0, 0, 0)}
        finally:
            regionReader.close()

        # Fold every variant into the regions that contain its start position
        for variantGffRecord in inputVariantsGff:
            for region, summary in summaries.items():
                if (region.seqid == variantGffRecord.seqid
                        and region.start <= variantGffRecord.start <= region.end):
                    counterName = counterNames[variantGffRecord.type]
                    variantLength = max(len(variantGffRecord.reference),
                                        len(variantGffRecord.variantSeq))
                    summary[counterName] += variantLength
                # TODO: base consensusQV on effective coverage
                # NOTE(review): this runs for every region on every variant,
                # so cQv stays (0, 0, 0) only when the variants file is
                # empty -- kept as in the original; confirm it is intended.
                summary["cQv"] = (20, 20, 20)
    finally:
        inputVariantsGff.close()

    # Pass 2: rewrite the summary. The with-block closes both handles (and
    # flushes the output) even when a record fails to parse.
    with open(options.inputAlignmentSummaryGff) as summaryIn, \
            open(options.output, "w") as summaryOut:
        inHeader = True

        for line in summaryIn:
            line = line.rstrip()

            # A blank line has no first character to inspect and no record
            if not line:
                continue

            # Pass any metadata line straight through
            if line.startswith("#"):
                summaryOut.write(line + "\n")
                continue

            if inHeader:
                # We are at the end of the header -- write the tool-specific headers
                for k, v in headers:
                    summaryOut.write("##%s %s\n" % (k, v))
                inHeader = False

            # Parse the line
            rec = Gff3Record.fromString(line)
            if rec.type != "region":
                continue

            # Look the summary up with the same key type used to store it
            summary = summaries[Region(rec.seqid, rec.start, rec.end)]
            if "cQv" in summary:
                line += ";%s=%s" % ("cQv", ",".join(
                    str(int(f)) for f in summary["cQv"]))
            for counterName in counterNames.values():
                if counterName in summary:
                    line += ";%s=%d" % (counterName, summary[counterName])
            summaryOut.write(line + "\n")
    def _mainLoop(self):
        """Copy the alignment summary GFF, adding per-event modification counts.

        Modification calls whose type is listed in
        ``self.knownModificationEvents`` are read from ``self.modifications``.
        ``self.alignmentSummary`` is then streamed into ``self.outfile``:

        * lines starting with ``#`` are written through with surrounding
          whitespace stripped; ``sequence-header`` lines additionally
          populate ``self.seqMap`` (internal tag -> external tag);
        * the tool-specific headers are written once, immediately before the
          first non-header line;
        * every ``region`` record gets ``modsfwd`` / ``modsrev`` set to a
          comma-separated list with one count per known event, in the order
          of ``self.knownModificationEvents``; records of other types are
          not written.

        Blank lines in the summary file are skipped.

        Returns:
            0 on completion.
        """
        events = self.knownModificationEvents
        headerString = ",".join(['"' + x + '"' for x in events])

        # Set up some additional headers to be injected
        headers = [
            ("source", "kineticModificationCaller 1.3.3"),
            ("source-commandline", " ".join(sys.argv)),
            (
                "attribute-description",
                "modsfwd - count of detected DNA modifications on forward strand by modification event type",
            ),
            (
                "attribute-description",
                "modsrev - count of detected DNA modifications on reverse strand by modification event type",
            ),
            ("region-modsfwd", headerString),
            # Was a second "region-modsfwd": the reverse-strand attribute
            # needs its own column description.
            ("region-modsrev", headerString),
        ]

        # Read in the existing modifications.gff and collect the calls
        modReader = GffReader(self.modifications)
        try:
            hits = [
                {"pos": x.start, "strand": x.strand, "seqid": x.seqid, "type": x.type}
                for x in modReader
                if x.type in events
            ]
        finally:
            modReader.close()

        self.seqMap = {}
        inHeader = True

        # open() + with: works on Python 2 and 3 and guarantees that both
        # handles are flushed and closed, even if a record fails to parse.
        with open(self.alignmentSummary) as summaryFile, \
                open(self.outfile, "w") as summaryWriter:
            for line in summaryFile:
                # Nothing to pass through or parse on an empty line
                if not line.strip():
                    continue

                # Pass any metadata line straight through
                if line[0] == "#":
                    # Parse headers
                    splitFields = line.replace("#", "").split(" ")
                    field = splitFields[0]
                    value = " ".join(splitFields[1:])
                    if field == "sequence-header":
                        internalTag, _, externalTag = value.strip().partition(" ")
                        self.seqMap[internalTag] = externalTag
                    summaryWriter.write(line.strip() + "\n")
                    continue

                if inHeader:
                    # We are at the end of the header -- write the tool-specific headers
                    for header in headers:
                        summaryWriter.write("##%s %s\n" % header)
                    inHeader = False

                # Parse the line
                rec = Gff3Record.fromString(line)
                if rec.type != "region":
                    continue

                # Get the hits in this interval, add them to the gff record
                intervalHits = [
                    h for h in hits
                    if rec.seqid == h["seqid"] and rec.start <= h["pos"] <= rec.end
                ]

                cFwd = self.countModificationTypes([h for h in intervalHits if h["strand"] == "+"])
                cRev = self.countModificationTypes([h for h in intervalHits if h["strand"] == "-"])

                rec.modsfwd = ",".join([str(cFwd[x]) for x in events])
                rec.modsrev = ",".join([str(cRev[x]) for x in events])

                summaryWriter.write(str(rec) + "\n")
        return 0
 def test_fromString(self):
     """Round-trip the fixture record through its string form."""
     original = str(self.record)
     parsedBack = Gff3Record.fromString(original)
     assert_equal(original, str(parsedBack))
예제 #7
0
    def _mainLoop(self):
        """Copy the alignment summary GFF, adding per-event modification counts.

        Modification calls whose type is listed in
        ``self.knownModificationEvents`` are read from ``self.modifications``.
        ``self.alignmentSummary`` is then streamed into ``self.outfile``:

        * lines starting with ``#`` are written through with surrounding
          whitespace stripped; ``sequence-header`` lines additionally
          populate ``self.seqMap`` (internal tag -> external tag);
        * the tool-specific headers are written once, immediately before the
          first non-header line;
        * every ``region`` record gets ``modsfwd`` / ``modsrev`` set to a
          comma-separated list with one count per known event, in the order
          of ``self.knownModificationEvents``; records of other types are
          not written.

        Blank lines in the summary file are skipped.

        Returns:
            0 on completion.
        """
        events = self.knownModificationEvents
        headerString = ",".join(['"' + x + '"' for x in events])

        # Set up some additional headers to be injected
        headers = [
            ('source', 'kineticModificationCaller 1.3.3'),
            ('source-commandline', " ".join(sys.argv)),
            ('attribute-description',
             'modsfwd - count of detected DNA modifications on forward strand by modification event type'
             ),
            ('attribute-description',
             'modsrev - count of detected DNA modifications on reverse strand by modification event type'
             ),
            ('region-modsfwd', headerString),
            # Was a second 'region-modsfwd': the reverse-strand attribute
            # needs its own column description.
            ('region-modsrev', headerString),
        ]

        # Read in the existing modifications.gff and collect the calls
        modReader = GffReader(self.modifications)
        try:
            hits = [{
                "pos": x.start,
                "strand": x.strand,
                "seqid": x.seqid,
                "type": x.type
            } for x in modReader if x.type in events]
        finally:
            modReader.close()

        self.seqMap = {}
        inHeader = True

        # The `file` builtin does not exist on Python 3; open() is the
        # portable spelling, and the with-block closes (and flushes) both
        # handles even if a record fails to parse.
        with open(self.alignmentSummary) as summaryFile, \
                open(self.outfile, "w") as summaryWriter:
            for line in summaryFile:
                # Nothing to pass through or parse on an empty line
                if not line.strip():
                    continue

                # Pass any metadata line straight through
                if line[0] == "#":
                    # Parse headers
                    splitFields = line.replace('#', '').split(' ')
                    field = splitFields[0]
                    value = " ".join(splitFields[1:])
                    if field == 'sequence-header':
                        internalTag, _, externalTag = value.strip().partition(' ')
                        self.seqMap[internalTag] = externalTag
                    print(line.strip(), file=summaryWriter)
                    continue

                if inHeader:
                    # We are at the end of the header -- write the tool-specific headers
                    for header in headers:
                        print(("##%s %s" % header), file=summaryWriter)
                    inHeader = False

                # Parse the line
                rec = Gff3Record.fromString(line)
                if rec.type != 'region':
                    continue

                # Get the hits in this interval, add them to the gff record
                intervalHits = [
                    h for h in hits
                    if rec.seqid == h['seqid']
                    and rec.start <= h['pos'] <= rec.end
                ]

                cFwd = self.countModificationTypes(
                    [h for h in intervalHits if h['strand'] == '+'])
                cRev = self.countModificationTypes(
                    [h for h in intervalHits if h['strand'] == '-'])

                rec.modsfwd = ",".join([str(cFwd[x]) for x in events])
                rec.modsrev = ",".join([str(cRev[x]) for x in events])

                print(str(rec), file=summaryWriter)
        return 0