Example #1
def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its coverage object.
    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference
    """

    def _get_name(id_):
        for c in contigs:
            if c.id == id_:
                return c.name

    cov_map = {}
    contig_ids = [c.id for c in contigs]

    reader = GffReader(alignment_summ_gff)
    for rec in reader:
        if rec.seqid not in contig_ids:
            log.info(
                "Unable to find gff '{i}' in alignment contig ids.".format(i=rec.seqid))
            continue

        try:
            contig_cov = cov_map[rec.seqid]
        except KeyError:
            contig_cov = ContigCoverage(rec.seqid, _get_name(rec.seqid))
            cov_map[rec.seqid] = contig_cov

        contig_cov.add_data(rec)

    reader.close()

    return cov_map
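
A minimal usage sketch for the function above (hypothetical paths; a namedtuple stands in for the reference contig objects, and GffReader, log, and ContigCoverage are assumed to be in scope as in the example):

from collections import namedtuple

# Hypothetical stand-in for the reference contig objects; only .id and .name
# are read by _get_contigs_to_plot.
Contig = namedtuple("Contig", ["id", "name"])

contigs = [Contig("lambda_NEB3011", "lambda NEB3011")]
cov_map = _get_contigs_to_plot("alignment_summary.gff", contigs)
for seqid, contig_cov in cov_map.items():
    print(seqid, contig_cov)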
Example #2
def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its coverage object.
    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference
    """
    def _get_name(id_):
        for c in contigs:
            if c.id == id_:
                return c.name

    cov_map = {}
    contig_ids = [c.id for c in contigs]

    reader = GffReader(alignment_summ_gff)
    for rec in reader:
        if rec.seqid not in contig_ids:
            log.info("Skipping seqid '{i}'.".format(i=rec.seqid))
            continue

        try:
            contig_cov = cov_map[rec.seqid]
        except KeyError:
            contig_cov = ContigCoverage(rec.seqid, _get_name(rec.seqid))
            cov_map[rec.seqid] = contig_cov

        contig_cov.add_data(rec)

    reader.close()

    return cov_map
Example #3
 def setUpClass(cls):
     super(TestModificationsOutput, cls).setUpClass()
     datastore = DataStore.from_job_path(cls.job_dir)
     entrypoints = EntryPoints.from_job_path(cls.job_dir)
     cls.h5_file = None
     cls.bw_file = None
     cls.gff_file = None
     for file_id, file_info in datastore.get_file_dict().iteritems():
         if file_info.is_chunked:
             continue
         if file_info.file_type_id == FileTypes.GFF.file_type_id:
             with GffReader(file_info.path) as gff:
                 for header in gff.headers:
                     if header.startswith("##source ipdSummary"):
                         cls.gff_file = file_info.path
         elif file_info.file_type_id == FileTypes.H5.file_type_id:
             cls.h5_file = file_info.path
         elif file_info.file_type_id == FileTypes.BIGWIG.file_type_id:
             cls.bw_file = file_info.path
     with GffReader(cls.gff_file) as gff:
         cls.gff_records = [rec for rec in gff]
     cls.gff_dict = {}
     for rec in cls.gff_records:
         cls.gff_dict[(rec.seqid, rec.start, rec.strand)] = rec
     ref = entrypoints.data['eid_ref_dataset']
     cls.seqids = []
     with ReferenceSet(ref) as rs:
         for i_ref, ctg in enumerate(rs):
             cls.seqids.append(ctg.id)
Example #4
def _get_contig_coverage(alignment_summ_gff, contigs):
    """
    Modifies the passed contigs object to include coverage information.
    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (dict) contig id -> ContigInfo object
    """
    reader = GffReader(alignment_summ_gff)
    for rec in reader:
        # Some contigs have no coverage but still make it into the gff file
        if rec.seqid in contigs:
            contigs[rec.seqid].add_coverage_data(rec)

    reader.close()
Example #6
 def run(self):
     with GffReader(self.gffFile) as reader, \
          VcfWriter(sys.stdout) as writer:
         self._writeMetaData(writer)
         for gff in reader:
             vcf = VcfRecord.fromVariantGffRecord(gff)
             writer.writeRecord(vcf)
Example #7
 def test_merge_gffs_sorted(self):
     gff_out = "tmp_pbcore_merged_sorted.gff"
     merge_gffs_sorted(self.files, gff_out)
     with GffReader(gff_out) as f:
         start = [(rec.seqid, rec.start) for rec in f]
         self.assertEqual(start, self.sorted_start)
     rm_out(gff_out)
Example #8
 def test_merge_gffs_sorted(self):
     gff_out = "tmp_pbcore_merged_sorted.gff"
     merge_gffs_sorted(self.files, gff_out)
     with GffReader(gff_out) as f:
         start = [(rec.seqid, rec.start) for rec in f]
         assert start == self.SORTED_START
     rm_out(gff_out)
Example #9
 def test_merge_gffs(self):
     gff_out = "tmp_pbcore_merge.gff"
     merge_gffs(self.files, gff_out)
     n_rec = 0
     for fn in self.files:
         with GffReader(fn) as f:
             n_rec += len([rec for rec in f])
     with GffReader(gff_out) as f:
         assert f.headers == [
             "##gff-version 3",
             "##source ipdSummary",
             "##sequence-region lambda_NEB3011 1 48502",
         ]
         n_rec_merged = len([rec for rec in f])
         assert n_rec == n_rec_merged
     rm_out(gff_out)
Example #10
    def _queueChunksForReference(self):

        # Read in motif_summary.csv
        motifInfo = {}
        reader = csv.reader(open(self.args.motif_summary, 'r'), delimiter=',')
        next(reader)  # advance past the header row
        if self.options.oldData:
            col = 1
        else:
            col = 2
        for row in reader:
            motifInfo[row[0]] = row[col]

        # Figure out the length of the motifs file:
        motReader = GffReader(self.args.motifs)
        if self.options.undetected:
            motifDicts = [{
                "seqID": x.seqid,
                "type": x.type,
                "score": x.score,
                "pos": x.start,
                "strand": x.strand,
                "attributes": x.attributes
            } for x in motReader if x.type == '.']
        else:
            motifDicts = [{
                "seqID": x.seqid,
                "type": x.type,
                "score": x.score,
                "pos": x.start,
                "strand": x.strand,
                "attributes": x.attributes
            } for x in motReader]

        refLength = len(motifDicts)

        # Maximum number of hits per chunk
        MAX_HITS = 500
        nBases = min(refLength, self.args.maxLength)
        nBlocks = max(self.options.numWorkers * 4, nBases // MAX_HITS)

        # Block layout
        blockSize = min(nBases, max(nBases // nBlocks + 1, 100))
        blockStarts = np.arange(0, nBases, step=blockSize)
        blockEnds = blockStarts + blockSize
        blocks = list(zip(blockStarts, blockEnds))

        if self.options.undetected:
            self.options.modifications = None

        # Queue up work blocks
        for block in blocks:
            # NOTE! The format of a work chunk is (refId <int>, refStartBase <int>, refEndBase <int>)
            # chunk = (refInfoId, block[0], block[1])
            # chunk = (self.options.motifs, self.refInfo, motifInfo, self.options.modifications, self.options.undetected, self.options.oldData, block[0], block[1])
            chunk = (motifDicts[block[0]:block[1]], self.refInfo, motifInfo,
                     self.options.modifications, self.options.undetected,
                     self.options.oldData, block[0], block[1])
            self._workQueue.put((self.workChunkCounter, chunk))
            self.workChunkCounter += 1
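
A quick numeric check of the block layout above (illustrative values; numpy is assumed to be imported as np, matching the snippet):

import numpy as np

nBases, nBlocks = 5000, 8
blockSize = min(nBases, max(nBases // nBlocks + 1, 100))  # 626
blockStarts = np.arange(0, nBases, step=blockSize)
blocks = list(zip(blockStarts, blockStarts + blockSize))
print(blocks)  # the last block end may overshoot nBases; list slicing tolerates that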
Example #11
    def find_top(self):
        """Sorting strategy here is to sort sublists and truncate down to list size as
        we go. So, for a 10000 line file, 1000 batchSortSize, and 100 final (howMany),
        iterate through the file, and sort each 1000 chunk, take the top 100"""

        reader = None

        with GffReader(self._variantsGff) as reader:

            locallist = []
            count = 0

            for gff3Record in reader:

                if count == self._batchSortSize:
                    locallist = self._sortAndTrim(locallist)
                    count = 0

                locallist.append(Variant(gff3Record))
                count = count + 1
            if count == 0:
                return []

            finalList = self._sortAndTrim(locallist)
            self._addContigNames(finalList)

            return finalList
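
The sort-and-trim strategy from the docstring can be illustrated in isolation; a minimal sketch with plain integers in place of Variant objects (batch_size and how_many are hypothetical names for _batchSortSize and the final list size):

def batch_sort_and_trim(items, batch_size, how_many):
    """Sort in batches, keeping only the current top how_many after each batch."""
    top = []
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            top = sorted(top + batch)[:how_many]
            batch = []
    return sorted(top + batch)[:how_many]

# 10000 items, batches of 1000, keep the top 100 -- the docstring's own example.
assert batch_sort_and_trim(range(10000, 0, -1), 1000, 100) == list(range(1, 101))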
Example #12
class TestGffReader:

    RAWFILE = open(data.getGff3())
    READER = GffReader(data.getGff3())

    def test_headers(self):
        assert [
            "##gff-version 3", "##pacbio-variant-version 2.1",
            "##date Sat Mar 22 12:16:13 2014",
            "##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.12",
            "##source GenomicConsensus 0.8.0",
            "##source-commandline /Users/dalexander/.virtualenvs/VE/bin/variantCaller.py --algorithm=plurality -q20 -x5 pbcore/data/aligned_reads_1.cmp.h5 -r /Users/dalexander/Data/lambdaNEB.fa -o /tmp/v.gff",
            "##source-alignment-file /Users/dalexander/Dropbox/Sources/git/pbcore/pbcore/data/aligned_reads_1.cmp.h5",
            "##source-reference-file /Users/dalexander/Data/lambdaNEB.fa",
            "##sequence-region lambda_NEB3011 1 48502"
        ] == self.READER.headers

    def test__iter__(self):
        records = list(self.READER)
        rawLines = self.RAWFILE.readlines()[9:]
        for record, rawLine in zip(records, rawLines):
            # No newlines or whitespace allowed in records
            assert str(record).strip() == str(record)
            # Make sure record matches line
            assert rawLine.strip() == str(record)
Example #13
    def _mainLoop(self):

        # Read in the existing modifications.gff
        modReader = GffReader(self.args.modifications)

        # Set up some additional headers to be injected
        headers = [
            ('source', 'kineticModificationCaller 1.3.1'),
            ('source-commandline', " ".join(sys.argv)),
            ('attribute-description', 'modsfwd - count of detected DNA modifications on forward strand'),
            ('attribute-description', 'modsrev - count of detected DNA modifications on reverse strand')
        ]

        # Get modification calls
        hits = [{"pos": x.start, "strand": x.strand} for x in modReader if x.type == 'modified_base']

        # Summary reader
        summaryFile = open(self.args.alignmentSummary)

        # Modified gff file
        summaryWriter = open(self.args.outfile, "w")

        self.seqMap = {}
        inHeader = True

        # Loop through
        for line in summaryFile:
            # Pass any metadata line straight through
            if line[0] == "#":

                # Parse headers
                splitFields = line.replace('#', '').split(' ')
                field = splitFields[0]
                value = " ".join(splitFields[1:])
                if field == 'sequence-header':
                    [internalTag, delim, externalTag] = value.strip().partition(' ')
                    self.seqMap[internalTag] = externalTag
                print >>summaryWriter, line.strip()
                continue

            if inHeader:
                # We are at the end of the header -- write the tool-specific headers
                for field in headers:
                    print >>summaryWriter, ("##%s %s" % field)
                inHeader = False

            # Parse the line
            rec = Gff3Record.fromString(line)

            if rec.type == 'region':
                # Get the hits in this interval, add them to the gff record
                intervalHits = [h for h in hits if rec.start <= h['pos'] <= rec.end]
                strand0Hits = len([h for h in intervalHits if h['strand'] == '+'])
                strand1Hits = len([h for h in intervalHits if h['strand'] == '-'])

                rec.modsfwd = strand0Hits
                rec.modsrev = strand1Hits

                print >>summaryWriter, str(rec)
Example #14
def _extract_alignment_summ_data(aln_summ_gff, contigs):
    """
    :param aln_summ_gff: (str) path to alignment_summary.gff
    :param contigs: (list) top contigs from reference
    :returns: 2 dictionaries containing data extracted from alignment_summary.gff
    """

    def _get_name(id_):
        for c in contigs:
            if c.id == id_:
                return c.name

    contig_ids = [c.id for c in contigs]

    ref_data = {}
    var_map = {}

    log.info("Reading GFF data from {f}".format(f=aln_summ_gff))

    reader = GffReader(aln_summ_gff)
    for rec in reader:
        seqid = rec.seqid.split()[0]
        if seqid not in contig_ids:
            continue

        # first data set
        ref_data.setdefault(seqid, [0, 0, 0, 0])
        ref_data[seqid][LENGTH] = max(rec.end, ref_data[seqid][LENGTH])
        numGaps, lenGaps = rec.attributes["gaps"].split(",")
        ref_data[seqid][GAPS] += int(lenGaps)
        ref_data[seqid][COV] += float(rec.attributes["cov2"].split(",")[0]) * \
            (rec.end - rec.start + 1)

        # second data set
        contig_var = None
        try:
            contig_var = var_map[seqid]
        except KeyError:
            contig_var = ContigVariants(seqid, _get_name(seqid))
            var_map[seqid] = contig_var

        contig_var.add_data(rec)

    reader.close()

    return ref_data, var_map
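
The attribute parsing above assumes alignment_summary.gff values like gaps=2,345 (count, then total gap length) and cov2=24.5,8.3 (mean coverage first); those exact semantics are an assumption here, but the split logic is easy to check in isolation:

# Hypothetical attribute values in the style the parser expects.
attributes = {"gaps": "2,345", "cov2": "24.5,8.3"}
num_gaps, len_gaps = attributes["gaps"].split(",")
mean_cov = float(attributes["cov2"].split(",")[0])
print(int(num_gaps), int(len_gaps), mean_cov)  # 2 345 24.5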
Example #15
 def test_gff_file_headers(self):
     """
     Check that every GFF file contains headers.
     """
     for gff_file in self.gff_files:
         with GffReader(gff_file) as r:
             self.assertTrue(
                 len(r.headers) > 0, "No headers in %s" % gff_file)
Example #16
 def test_reprocessed_gff_has_motifs(self):
     if len(self.motif_records) == 0:
         raise SkipTests("No motifs found, so none expected in GFF")
     n_motif_annotations = 0
     with GffReader(self.motifs_gff) as gff:
         for rec in gff:
             if "motif" in rec.attributes:
                 n_motif_annotations += 1
     log.info("Found {n} annotations".format(n=n_motif_annotations))
     self.assertTrue(n_motif_annotations > 0,
                     "No motif annotations found in reprocessed GFF")
Example #17
 def test_gff_seqid_is_fasta_identifier(self):
     """
     Check that GFF files use only the identifier part of FASTA headers,
     no spaces allowed - see ticket 28667
     """
     for gff_file in self.gff_files:
         with GffReader(gff_file) as r:
             for rec in r:
                 self.assertTrue(
                     not " " in rec.seqid,
                     "seqid contains spaces:\n%s\n(file: %s)" %
                     (str(rec), gff_file))
Example #18
def _append_variants_gff_data(ref_data, variants_gff):
    """
    Adds data from variants gff to the ref_data dict
    :param ref_data: (dict) dict of data pulled from alignment_summary.gff
    :param variants_gff: (str) path to variants_gff

    :type variants_gff: str
    """
    reader = GffReader(variants_gff)
    for record in reader:
        err_len = record.end - record.start + 1
        seqid = record.seqid.split()[0]
        if seqid in ref_data:
            ref_data[seqid][ERR] += err_len
        else:
            # the variants might not be present in the top 25 contigs,
            # so we can just raise a warning in the log.
            msg = "Unable to find {r} in {f}".format(r=seqid, f=variants_gff)
            log.warn(msg)

    reader.close()
Example #19
    def run(self):
        with GffReader(self.gffFile) as reader, \
             BedWriter(sys.stdout)   as writer:

            writer.writeHeader(self.options.name, self.options.description,
                               self.options.useScore)
            for gff in reader:
                if self.purpose == 'coverage':
                    bedRecord = CoverageBedRecord.fromAlignmentSummaryGffRecord(
                        gff)
                else:
                    bedRecord = VariantsBedRecord.fromVariantGffRecord(gff)
                writer.writeRecord(bedRecord)
Example #20
 def getMetrics(cls):
     cls.consensus_summary_gff = cls.coverage_summary_gff = None
     for file_id, file_info in cls.datastore.get_file_dict().iteritems():
         if file_info.file_type_id == FileTypes.GFF.file_type_id:
             if "summarize_consensus" in file_info.file_id:
                 cls.consensus_summary_gff = file_info.path
             elif "summarize_coverage" in file_info.file_id:
                 cls.coverage_summary_gff = file_info.path
     cls.consensus_records = []
     cls.coverage_records = []
     if cls.consensus_summary_gff is not None:
         for MID in cls.METRIC_IDS:
             cls.metric_dict[MID] = 0
         with GffReader(cls.consensus_summary_gff) as f:
             for rec in f:
                 cls.consensus_records.append(rec)
                 a = rec.attributes
                 cls.metric_dict["n_deletions"] += int(a["del"])
                 cls.metric_dict["n_insertions"] += int(a["ins"])
                 cls.metric_dict["n_substitutions"] += int(a["sub"])
         with GffReader(cls.coverage_summary_gff) as f:
             cls.coverage_records.extend([rec for rec in f])
Example #21
def _append_variants_gff_data(ref_data, variants_gff):
    """
    Adds data from variants gff to the ref_data dict
    :param ref_data: (dict) dict of data pulled from alignment_summary.gff
    :param variants_gff: (str) path to variants_gff

    :type variants_gff: str
    """
    reader = GffReader(variants_gff)
    for record in reader:
        err_len = record.end - record.start + 1
        seqid = record.seqid.split()[0]
        if seqid in ref_data:
            ref_data[seqid][ERR] += err_len
        else:
            # the variants might not be present in the top 25 contigs,
            # so we can just raise a warning in the log.
            msg = "Unable to find {r} in {f}".format(
                r=seqid, f=variants_gff)
            log.warn(msg)

    reader.close()
Example #22
 def setUpClass(cls):
     with FastaWriter(cls.REFERENCE) as fasta_out:
         with FastaReader(TestCoverageRpt.REFERENCE) as fasta_in:
             for rec in fasta_in:
                 header = rec.id + "|quiver"
                 fasta_out.writeRecord(header, rec.sequence)
     with GffWriter(cls.GFF) as gff_out:
         with GffReader(TestCoverageRpt.GFF) as gff_in:
             for header in gff_in.headers:
                 gff_out.writeHeader(header)
             for rec in gff_in:
                 rec.seqid += "|quiver"
                 gff_out.writeRecord(rec)
Example #23
 def test_gff_sort_order(self):
     """
     Check that records in all GFF output files are in sorted order
     (verification for bug 27785).
     """
     for gff_file in self.gff_files:
         with GffReader(gff_file) as gff:
             last_rec = None
             for rec in gff:
                 if last_rec is not None and rec.seqid == last_rec.seqid:
                     self.assertTrue(
                         rec.start >= last_rec.start,
                         "Records occur out of order:\n{l}\n{r}".format(
                             r=rec, l=last_rec))
                 last_rec = rec
Example #24
    def _mainLoop(self):

        # Read in the existing modifications.gff
        modReader = GffReader(self.modifications)

        headerString = ",".join(
            ['"' + x + '"' for x in self.knownModificationEvents])

        # Set up some additional headers to be injected
        headers = [
            ('source', 'kineticModificationCaller 1.3.3'),
            ('source-commandline', " ".join(sys.argv)),
            ('attribute-description',
             'modsfwd - count of detected DNA modifications on forward strand by modification event type'
             ),
            ('attribute-description',
             'modsrev - count of detected DNA modifications on reverse strand by modification event type'
             ), ('region-modsfwd', headerString),
            ('region-modsrev', headerString)
        ]

        hitsByEvent = dict([(x, []) for x in self.knownModificationEvents])

        # Get modification calls
        hits = [{
            "pos": x.start,
            "strand": x.strand,
            "seqid": x.seqid,
            "type": x.type
        } for x in modReader if x.type in self.knownModificationEvents]

        # Summary reader
        summaryFile = open(self.alignmentSummary)

        # Modified gff file
        summaryWriter = open(self.outfile, "w")

        self.seqMap = {}
        inHeader = True

        # Loop through
        for line in summaryFile:
            # Pass any metadata line straight through
            if line[0] == "#":

                # Parse headers
                splitFields = line.replace('#', '').split(' ')
                field = splitFields[0]
                value = " ".join(splitFields[1:])
                if field == 'sequence-header':
                    [internalTag, delim,
                     externalTag] = value.strip().partition(' ')
                    self.seqMap[internalTag] = externalTag
                print(line.strip(), file=summaryWriter)
                continue

            if inHeader:
                # We are at the end of the header -- write the tool-specific headers
                for field in headers:
                    print(("##%s %s" % field), file=summaryWriter)
                inHeader = False

            # Parse the line
            rec = Gff3Record.fromString(line)

            if rec.type == 'region':
                # Get the hits in this interval, add them to the gff record
                intervalHits = [
                    h for h in hits if rec.start <= h['pos'] <= rec.end
                    and rec.seqid == h['seqid']
                ]

                cFwd = self.countModificationTypes(
                    [h for h in intervalHits if h['strand'] == '+'])
                cRev = self.countModificationTypes(
                    [h for h in intervalHits if h['strand'] == '-'])

                rec.modsfwd = ",".join(
                    [str(cFwd[x]) for x in self.knownModificationEvents])
                rec.modsrev = ",".join(
                    [str(cRev[x]) for x in self.knownModificationEvents])

                print(str(rec), file=summaryWriter)
        return 0
Example #25
 def test_sort_gff(self):
     gff_out = sort_gff(self.combined)
     with GffReader(gff_out) as f:
         start = [(rec.seqid, rec.start) for rec in f]
         assert start == self.SORTED_START
     rm_out(gff_out)
Example #26
 def setup(self):
     self.rawFile = open(data.getGff3())
     self.reader = GffReader(data.getGff3())
Example #27
def main():
    headers = [
        ("source", "GenomicConsensus %s" % __VERSION__),
        ("pacbio-alignment-summary-version", "0.6"),
        ("source-commandline", " ".join(sys.argv)),
    ]

    desc = "Augment the alignment_summary.gff file with consensus and variants information."
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--variantsGff",
                        type=str,
                        help="Input variants.gff or variants.gff.gz filename",
                        required=True)
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        help="Output alignment_summary.gff filename")
    parser.add_argument("inputAlignmentSummaryGff",
                        type=str,
                        help="Input alignment_summary.gff filename")

    options = parser.parse_args()

    inputVariantsGff = GffReader(options.variantsGff)
    inputAlignmentSummaryGff = GffReader(options.inputAlignmentSummaryGff)

    summaries = {}
    for gffRecord in inputAlignmentSummaryGff:
        region = Region(gffRecord.seqid, gffRecord.start, gffRecord.end)
        summaries[region] = {"ins": 0, "del": 0, "sub": 0, "cQv": (0, 0, 0)}
    inputAlignmentSummaryGff.close()

    counterNames = {
        "insertion": "ins",
        "deletion": "del",
        "substitution": "sub"
    }
    for variantGffRecord in inputVariantsGff:
        for region in summaries:
            summary = summaries[region]
            if (region.seqid == variantGffRecord.seqid
                    and region.start <= variantGffRecord.start <= region.end):
                counterName = counterNames[variantGffRecord.type]
                variantLength = max(len(variantGffRecord.reference),
                                    len(variantGffRecord.variantSeq))
                summary[counterName] += variantLength
            # TODO: base consensusQV on effective coverage
            summary["cQv"] = (20, 20, 20)

    inputAlignmentSummaryGff = open(options.inputAlignmentSummaryGff)
    outputAlignmentSummaryGff = open(options.output, "w")

    inHeader = True

    for line in inputAlignmentSummaryGff:
        line = line.rstrip()

        # Pass any metadata line straight through
        if line[0] == "#":
            print >> outputAlignmentSummaryGff, line.strip()
            continue

        if inHeader:
            # We are at the end of the header -- write the tool-specific headers
            for k, v in headers:
                print >> outputAlignmentSummaryGff, ("##%s %s" % (k, v))
            inHeader = False

        # Parse the line
        rec = Gff3Record.fromString(line)

        if rec.type == "region":
            summary = summaries[(rec.seqid, rec.start, rec.end)]
            if "cQv" in summary:
                cQvTuple = summary["cQv"]
                line += ";%s=%s" % ("cQv", ",".join(
                    str(int(f)) for f in cQvTuple))
            for counterName in counterNames.values():
                if counterName in summary:
                    line += ";%s=%d" % (counterName, summary[counterName])
            print >> outputAlignmentSummaryGff, line
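
Note that the second pass looks up summaries[(rec.seqid, rec.start, rec.end)] with a plain tuple even though the keys were built as Region objects; this only works if Region is a namedtuple (or another tuple subclass), which hashes and compares equal to the corresponding tuple. A minimal sketch of that equivalence (the Region definition here is an assumption):

from collections import namedtuple

# Assumed shape of Region; as a namedtuple it hashes like a plain tuple,
# so a lookup with ("chr1", 1, 100) finds the Region key.
Region = namedtuple("Region", ["seqid", "start", "end"])

summaries = {Region("chr1", 1, 100): {"ins": 0, "del": 0, "sub": 0}}
assert summaries[("chr1", 1, 100)] == {"ins": 0, "del": 0, "sub": 0}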
Example #28
 def test_sort_gff(self):
     gff_out = sort_gff(self.combined)
     with GffReader(gff_out) as f:
         start = [(rec.seqid, rec.start) for rec in f]
         self.assertEqual(start, self.sorted_start)
     rm_out(gff_out)
Example #29
    def onChunk(self, referenceWindow):

        (motifDicts, refInfo, motifInfo, modifFile, undetectedOnly, oldData,
         start, end) = referenceWindow

        print "Start = ", start, " End = ", end

        # Read in the modifications GFF if needed:

        if not undetectedOnly:
            modReader = GffReader(modifFile)
            modifDicts = [{
                "seqID": x.seqid,
                "type": x.type,
                "score": x.score,
                "pos": x.start,
                "strand": x.strand,
                "attributes": x.attributes
            } for x in modReader]

        # To help find the correct reference for an entry in motifs.gff
        self.selectRef = [x.FullName for x in refInfo]

        # Loop through the rows of the motifs GFF file:
        self.pre = self.ipdModel.gbmModel.post
        self.post = self.ipdModel.gbmModel.pre
        collectResults = []
        modificationsLinecount = 0

        for d in motifDicts:

            if d["type"] != '.' and undetectedOnly:
                # Go on to the next row
                continue

            ref = refInfo[self.getReferenceIndex(d, oldData)]
            self.refId = ref.ID

            k = self.fillOutEasyInformation(d, motifInfo)
            if not "motif" in k.keys():
                # If no motif is listed in this row, go on to the next row
                continue

            if d["type"] != '.' and not undetectedOnly:

                # Search sorted list of template positions in modifications GFF for a match:
                for y in modifDicts[modificationsLinecount:]:
                    if y["pos"] == d["pos"] and y["strand"] == d[
                            "strand"] and y["seqID"] == d["seqID"]:
                        break
                    modificationsLinecount += 1

                # Once the match is found, copy in the modified fraction estimate
                if modificationsLinecount <= len(modifDicts):
                    u = y["attributes"]

                    if FRAC in u:
                        k[FRAC] = float(u[FRAC])
                        k[FRAClow] = float(u[FRAClow])
                        k[FRACup] = float(u[FRACup])

            if d["type"] == '.':

                # See update to ResultsWriter: 'nMd' is 'not modified'
                k['modification'] = 'nMd'

                # Figure out modification type:
                if oldData:
                    self.modificationType = self.oldDataModificationType(
                        motifInfo)
                else:
                    self.modificationType = motifInfo[k['motif']]

                # Select a window around the current position to use for estimation
                stop = k["tpl"] + self.post
                start = min(max(1, (k["tpl"] - self.pre)), stop)

                # Trim end coordinate to length of current template
                # end = min(end, self.ipdModel.refLength(self.refId))

                # Try to estimate the modified fraction:
                if self.modificationType == 'modified_base':
                    # In this case, we'll need the mean Ipd function:
                    self.meanIpdFunc = self.ipdModel.predictIpdFunc(self.refId)

                self.strand = k['strand']
                perSiteResults = self._summarizeReferenceRegion((start, stop))

                if self.modificationType == 'modified_base':
                    k[FRAC] = perSiteResults[self.post - 1][FRAC]
                    k[FRAClow] = perSiteResults[self.post - 1][FRAClow]
                    k[FRACup] = perSiteResults[self.post - 1][FRACup]

                else:
                    mods = self._decodePositiveControl(perSiteResults,
                                                       (start, stop))
                    k[FRAC] = mods[0]
                    k[FRAClow] = mods[1]
                    k[FRACup] = mods[2]

            collectResults.append(k)

        return collectResults