def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its
    coverage object.
    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference
    """
    def _get_name(id_):
        for c in contigs:
            if c.id == id_:
                return c.name

    cov_map = {}
    contig_ids = [c.id for c in contigs]
    reader = GffReader(alignment_summ_gff)
    for rec in reader:
        if rec.seqid not in contig_ids:
            log.info(
                "Unable to find gff '{i}' in alignment contig ids.".format(
                    i=rec.seqid))
            continue
        try:
            contig_cov = cov_map[rec.seqid]
        except KeyError:
            contig_cov = ContigCoverage(rec.seqid, _get_name(rec.seqid))
            cov_map[rec.seqid] = contig_cov
        contig_cov.add_data(rec)
    reader.close()
    return cov_map
def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its
    coverage object.
    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference
    """
    def _get_name(id_):
        for c in contigs:
            if c.id == id_:
                return c.name

    cov_map = {}
    contig_ids = [c.id for c in contigs]
    reader = GffReader(alignment_summ_gff)
    for rec in reader:
        if rec.seqid not in contig_ids:
            log.info("Skipping seqid '{i}'.".format(i=rec.seqid))
            continue
        try:
            contig_cov = cov_map[rec.seqid]
        except KeyError:
            contig_cov = ContigCoverage(rec.seqid, _get_name(rec.seqid))
            cov_map[rec.seqid] = contig_cov
        contig_cov.add_data(rec)
    reader.close()
    return cov_map
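# A minimal, hypothetical driver showing how the coverage map returned by
# _get_contigs_to_plot might be consumed; make_coverage_plot and the shape of
# the contigs list are illustrative assumptions, not part of the snippets above.
def plot_top_contigs(alignment_summ_gff, contigs, make_coverage_plot):
    # contigs: objects exposing .id and .name (the "top contigs" of a reference)
    cov_map = _get_contigs_to_plot(alignment_summ_gff, contigs)
    for contig_id, contig_cov in cov_map.items():
        # each value is a ContigCoverage accumulated from the GFF region records
        make_coverage_plot(contig_id, contig_cov)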
def setUpClass(cls):
    super(TestModificationsOutput, cls).setUpClass()
    datastore = DataStore.from_job_path(cls.job_dir)
    entrypoints = EntryPoints.from_job_path(cls.job_dir)
    cls.h5_file = None
    cls.bw_file = None
    cls.gff_file = None
    for file_id, file_info in datastore.get_file_dict().iteritems():
        if file_info.is_chunked:
            continue
        if file_info.file_type_id == FileTypes.GFF.file_type_id:
            with GffReader(file_info.path) as gff:
                for header in gff.headers:
                    if header.startswith("##source ipdSummary"):
                        cls.gff_file = file_info.path
        elif file_info.file_type_id == FileTypes.H5.file_type_id:
            cls.h5_file = file_info.path
        elif file_info.file_type_id == FileTypes.BIGWIG.file_type_id:
            cls.bw_file = file_info.path
    with GffReader(cls.gff_file) as gff:
        cls.gff_records = [rec for rec in gff]
    cls.gff_dict = {}
    for rec in cls.gff_records:
        cls.gff_dict[(rec.seqid, rec.start, rec.strand)] = rec
    ref = entrypoints.data['eid_ref_dataset']
    cls.seqids = []
    with ReferenceSet(ref) as rs:
        for i_ref, ctg in enumerate(rs):
            cls.seqids.append(ctg.id)
def _get_contig_coverage(alignment_summ_gff, contigs):
    """
    Modifies the passed contigs object to include coverage information.
    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (dict) contig id -> ContigInfo object
    """
    reader = GffReader(alignment_summ_gff)
    for rec in reader:
        # Some contigs don't have any coverage, but make it into the gff file
        if rec.seqid in contigs:
            contigs[rec.seqid].add_coverage_data(rec)
    reader.close()
def run(self):
    with GffReader(self.gffFile) as reader, \
            VcfWriter(sys.stdout) as writer:
        self._writeMetaData(writer)
        for gff in reader:
            vcf = VcfRecord.fromVariantGffRecord(gff)
            writer.writeRecord(vcf)
def test_merge_gffs_sorted(self):
    gff_out = "tmp_pbcore_merged_sorted.gff"
    merge_gffs_sorted(self.files, gff_out)
    with GffReader(gff_out) as f:
        start = [(rec.seqid, rec.start) for rec in f]
        self.assertEqual(start, self.sorted_start)
    rm_out(gff_out)
def test_merge_gffs_sorted(self):
    gff_out = "tmp_pbcore_merged_sorted.gff"
    merge_gffs_sorted(self.files, gff_out)
    with GffReader(gff_out) as f:
        start = [(rec.seqid, rec.start) for rec in f]
        assert start == self.SORTED_START
    rm_out(gff_out)
def test_merge_gffs(self):
    gff_out = "tmp_pbcore_merge.gff"
    merge_gffs(self.files, gff_out)
    n_rec = 0
    for fn in self.files:
        with GffReader(fn) as f:
            n_rec += len([rec for rec in f])
    with GffReader(gff_out) as f:
        assert f.headers == [
            "##gff-version 3",
            "##source ipdSummary",
            "##sequence-region lambda_NEB3011 1 48502",
        ]
        n_rec_merged = len([rec for rec in f])
        assert n_rec == n_rec_merged
    rm_out(gff_out)
def _queueChunksForReference(self):
    # Read in motif_summary.csv
    motifInfo = {}
    reader = csv.reader(open(self.args.motif_summary, 'r'), delimiter=',')
    reader.next()
    if self.options.oldData:
        col = 1
    else:
        col = 2
    for row in reader:
        motifInfo[row[0]] = row[col]

    # Figure out the length of the motifs file:
    motReader = GffReader(self.args.motifs)
    if self.options.undetected:
        motifDicts = [{"seqID": x.seqid, "type": x.type, "score": x.score,
                       "pos": x.start, "strand": x.strand,
                       "attributes": x.attributes}
                      for x in motReader if x.type == '.']
    else:
        motifDicts = [{"seqID": x.seqid, "type": x.type, "score": x.score,
                       "pos": x.start, "strand": x.strand,
                       "attributes": x.attributes}
                      for x in motReader]
    refLength = len(motifDicts)

    # Maximum number of hits per chunk
    MAX_HITS = 500
    nBases = min(refLength, self.args.maxLength)
    nBlocks = max(self.options.numWorkers * 4, nBases / MAX_HITS)

    # Block layout
    blockSize = min(nBases, max(nBases / nBlocks + 1, 100))
    blockStarts = np.arange(0, nBases, step=blockSize)
    blockEnds = blockStarts + blockSize
    blocks = zip(blockStarts, blockEnds)

    if self.options.undetected:
        self.options.modifications = None

    # Queue up work blocks
    for block in blocks:
        # NOTE! The format of a work chunk is
        # (refId <int>, refStartBase <int>, refEndBase <int>)
        # chunk = (refInfoId, block[0], block[1])
        # chunk = (self.options.motifs, self.refInfo, motifInfo,
        #          self.options.modifications, self.options.undetected,
        #          self.options.oldData, block[0], block[1])
        chunk = (motifDicts[block[0]:block[1]], self.refInfo, motifInfo,
                 self.options.modifications, self.options.undetected,
                 self.options.oldData, block[0], block[1])
        self._workQueue.put((self.workChunkCounter, chunk))
        self.workChunkCounter += 1
def find_top(self):
    """Sorting strategy here is to sort sublists and truncate down to list
    size as we go. So, for a 10000 line file, 1000 batchSortSize, and 100
    final (howMany), iterate through the file, and sort each 1000 chunk,
    take the top 100"""
    reader = None
    with GffReader(self._variantsGff) as reader:
        locallist = []
        count = 0
        for gff3Record in reader:
            if count == self._batchSortSize:
                locallist = self._sortAndTrim(locallist)
                count = 0
            locallist.append(Variant(gff3Record))
            count = count + 1
        if count == 0:
            return []
        finalList = self._sortAndTrim(locallist)
        self._addContigNames(finalList)
        return finalList
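# The docstring above describes the batch sort-and-truncate strategy, but the
# helper itself is not shown. A minimal sketch of what _sortAndTrim could look
# like, assuming Variant objects define their own ordering (e.g. by confidence)
# and that self._howMany holds the final list size; both are assumptions here,
# not taken from the snippet.
def _sortAndTrim(self, variants):
    # Sort the current batch and keep only the top self._howMany entries,
    # so memory stays bounded at roughly batchSortSize + howMany items.
    return sorted(variants, reverse=True)[:self._howMany]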
class TestGffReader:
    RAWFILE = open(data.getGff3())
    READER = GffReader(data.getGff3())

    def test_headers(self):
        assert [
            "##gff-version 3",
            "##pacbio-variant-version 2.1",
            "##date Sat Mar 22 12:16:13 2014",
            "##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.12",
            "##source GenomicConsensus 0.8.0",
            "##source-commandline /Users/dalexander/.virtualenvs/VE/bin/variantCaller.py --algorithm=plurality -q20 -x5 pbcore/data/aligned_reads_1.cmp.h5 -r /Users/dalexander/Data/lambdaNEB.fa -o /tmp/v.gff",
            "##source-alignment-file /Users/dalexander/Dropbox/Sources/git/pbcore/pbcore/data/aligned_reads_1.cmp.h5",
            "##source-reference-file /Users/dalexander/Data/lambdaNEB.fa",
            "##sequence-region lambda_NEB3011 1 48502"
        ] == self.READER.headers

    def test__iter__(self):
        records = list(self.READER)
        rawLines = self.RAWFILE.readlines()[9:]
        for record, rawLine in zip(records, rawLines):
            # No newlines or whitespace allowed in records
            assert str(record).strip() == str(record)
            # Make sure record matches line
            assert rawLine.strip() == str(record)
def _mainLoop(self):
    # Read in the existing modifications.gff
    modReader = GffReader(self.args.modifications)

    # Set up some additional headers to be injected
    headers = [
        ('source', 'kineticModificationCaller 1.3.1'),
        ('source-commandline', " ".join(sys.argv)),
        ('attribute-description',
         'modsfwd - count of detected DNA modifications on forward strand'),
        ('attribute-description',
         'modsrev - count of detected DNA modifications on reverse strand')
    ]

    # Get modification calls
    hits = [{"pos": x.start, "strand": x.strand}
            for x in modReader if x.type == 'modified_base']

    # Summary reader
    summaryFile = file(self.args.alignmentSummary)

    # Modified gff file
    summaryWriter = file(self.args.outfile, "w")

    self.seqMap = {}
    inHeader = True

    # Loop through
    for line in summaryFile:
        # Pass any metadata line straight through
        if line[0] == "#":
            # Parse headers
            splitFields = line.replace('#', '').split(' ')
            field = splitFields[0]
            value = " ".join(splitFields[1:])
            if field == 'sequence-header':
                [internalTag, delim, externalTag] = value.strip().partition(' ')
                self.seqMap[internalTag] = externalTag
            print >>summaryWriter, line.strip()
            continue

        if inHeader:
            # We are at the end of the header -- write the tool-specific headers
            for field in headers:
                print >>summaryWriter, ("##%s %s" % field)
            inHeader = False

        # Parse the line
        rec = Gff3Record.fromString(line)
        if rec.type == 'region':
            # Get the hits in this interval, add them to the gff record
            intervalHits = [h for h in hits
                            if rec.start <= h['pos'] <= rec.end]
            strand0Hits = len([h for h in intervalHits if h['strand'] == '+'])
            strand1Hits = len([h for h in intervalHits if h['strand'] == '-'])
            rec.modsfwd = strand0Hits
            rec.modsrev = strand1Hits
            print >>summaryWriter, str(rec)
def _extract_alignment_summ_data(aln_summ_gff, contigs):
    """
    :param aln_summ_gff: (str) path to alignment_summary.gff
    :param contigs: (list) top contigs from reference
    :returns: 2 dictionaries containing data extracted from
        alignment_summary.gff
    """
    def _get_name(id_):
        for c in contigs:
            if c.id == id_:
                return c.name

    contig_ids = [c.id for c in contigs]

    ref_data = {}
    var_map = {}

    log.info("Reading GFF data from {f}".format(f=aln_summ_gff))

    reader = GffReader(aln_summ_gff)
    for rec in reader:
        seqid = rec.seqid.split()[0]
        if seqid not in contig_ids:
            continue

        # first data set
        ref_data.setdefault(seqid, [0, 0, 0, 0])
        ref_data[seqid][LENGTH] = max(rec.end, ref_data[seqid][LENGTH])
        numGaps, lenGaps = rec.attributes["gaps"].split(",")
        ref_data[seqid][GAPS] += int(lenGaps)
        ref_data[seqid][COV] += float(rec.attributes["cov2"].split(",")[0]) * \
            (rec.end - rec.start + 1)

        # second data set
        contig_var = None
        try:
            contig_var = var_map[seqid]
        except KeyError:
            contig_var = ContigVariants(seqid, _get_name(seqid))
            var_map[seqid] = contig_var
        contig_var.add_data(rec)
    reader.close()

    return ref_data, var_map
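# _extract_alignment_summ_data above and _append_variants_gff_data below index
# the per-contig ref_data lists with module-level constants that these snippets
# do not define. A plausible layout, stated purely as an assumption to make the
# indexing readable:
LENGTH, GAPS, ERR, COV = 0, 1, 2, 3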
def test_gff_file_headers(self):
    """
    Check that every GFF file contains headers.
    """
    for gff_file in self.gff_files:
        with GffReader(gff_file) as r:
            self.assertTrue(len(r.headers) > 0,
                            "No headers in %s" % gff_file)
def test_reprocessed_gff_has_motifs(self):
    if len(self.motif_records) == 0:
        raise SkipTests("No motifs found, so none expected in GFF")
    n_motif_annotations = 0
    with GffReader(self.motifs_gff) as gff:
        for rec in gff:
            if "motif" in rec.attributes:
                n_motif_annotations += 1
    log.info("Found {n} annotations".format(n=n_motif_annotations))
    self.assertTrue(n_motif_annotations > 0,
                    "No motif annotations found in reprocessed GFF")
def test_gff_seqid_is_fasta_identifier(self):
    """
    Check that GFF files use only the identifier part of FASTA headers,
    no spaces allowed - see ticket 28667
    """
    for gff_file in self.gff_files:
        with GffReader(gff_file) as r:
            for rec in r:
                self.assertTrue(
                    not " " in rec.seqid,
                    "seqid contains spaces:\n%s\n(file: %s)" %
                    (str(rec), gff_file))
def _append_variants_gff_data(ref_data, variants_gff):
    """
    Adds data from variants gff to the ref_data dict
    :param ref_data: (dict) dict of data pulled from alignment_summary.gff
    :param variants_gff: (str) path to variants_gff
    :type variants_gff: str
    """
    reader = GffReader(variants_gff)
    for record in reader:
        err_len = record.end - record.start + 1
        seqid = record.seqid.split()[0]
        if seqid in ref_data:
            ref_data[seqid][ERR] += err_len
        else:
            # the variants might not be present in the top 25 contigs,
            # so we can just raise a warning in the log.
            msg = "Unable to find {r} in {f}".format(r=seqid, f=variants_gff)
            log.warn(msg)
    reader.close()
def run(self):
    with GffReader(self.gffFile) as reader, \
            BedWriter(sys.stdout) as writer:
        writer.writeHeader(self.options.name,
                           self.options.description,
                           self.options.useScore)
        for gff in reader:
            if self.purpose == 'coverage':
                bedRecord = CoverageBedRecord.fromAlignmentSummaryGffRecord(gff)
            else:
                bedRecord = VariantsBedRecord.fromVariantGffRecord(gff)
            writer.writeRecord(bedRecord)
def getMetrics(cls):
    cls.consensus_summary_gff = cls.coverage_summary_gff = None
    for file_id, file_info in cls.datastore.get_file_dict().iteritems():
        if file_info.file_type_id == FileTypes.GFF.file_type_id:
            if "summarize_consensus" in file_info.file_id:
                cls.consensus_summary_gff = file_info.path
            elif "summarize_coverage" in file_info.file_id:
                cls.coverage_summary_gff = file_info.path
    cls.consensus_records = []
    cls.coverage_records = []
    if cls.consensus_summary_gff is not None:
        for MID in cls.METRIC_IDS:
            cls.metric_dict[MID] = 0
        with GffReader(cls.consensus_summary_gff) as f:
            for rec in f:
                cls.consensus_records.append(rec)
                a = rec.attributes
                cls.metric_dict["n_deletions"] += int(a["del"])
                cls.metric_dict["n_insertions"] += int(a["ins"])
                cls.metric_dict["n_substitutions"] += int(a["sub"])
        with GffReader(cls.coverage_summary_gff) as f:
            cls.coverage_records.extend([rec for rec in f])
def setUpClass(cls):
    with FastaWriter(cls.REFERENCE) as fasta_out:
        with FastaReader(TestCoverageRpt.REFERENCE) as fasta_in:
            for rec in fasta_in:
                header = rec.id + "|quiver"
                fasta_out.writeRecord(header, rec.sequence)
    with GffWriter(cls.GFF) as gff_out:
        with GffReader(TestCoverageRpt.GFF) as gff_in:
            for header in gff_in.headers:
                gff_out.writeHeader(header)
            for rec in gff_in:
                rec.seqid += "|quiver"
                gff_out.writeRecord(rec)
def test_gff_sort_order(self):
    """
    Check that records in all GFF output files are in sorted order
    (verification for bug 27785).
    """
    for gff_file in self.gff_files:
        with GffReader(gff_file) as gff:
            last_rec = None
            for rec in gff:
                if last_rec is not None and rec.seqid == last_rec.seqid:
                    self.assertTrue(
                        rec.start >= last_rec.start,
                        "Records occur out of order:\n{l}\n{r}".format(
                            r=rec, l=last_rec))
                last_rec = rec
def _mainLoop(self):
    # Read in the existing modifications.gff
    modReader = GffReader(self.modifications)

    headerString = ",".join(
        ['"' + x + '"' for x in self.knownModificationEvents])

    # Set up some additional headers to be injected
    headers = [
        ('source', 'kineticModificationCaller 1.3.3'),
        ('source-commandline', " ".join(sys.argv)),
        ('attribute-description',
         'modsfwd - count of detected DNA modifications on forward strand by modification event type'),
        ('attribute-description',
         'modsrev - count of detected DNA modifications on reverse strand by modification event type'),
        ('region-modsfwd', headerString),
        ('region-modsrev', headerString)
    ]

    hitsByEvent = dict([(x, []) for x in self.knownModificationEvents])

    # Get modification calls
    hits = [{"pos": x.start,
             "strand": x.strand,
             "seqid": x.seqid,
             "type": x.type}
            for x in modReader if x.type in self.knownModificationEvents]

    # Summary reader
    summaryFile = file(self.alignmentSummary)

    # Modified gff file
    summaryWriter = file(self.outfile, "w")

    self.seqMap = {}
    inHeader = True

    # Loop through
    for line in summaryFile:
        # Pass any metadata line straight through
        if line[0] == "#":
            # Parse headers
            splitFields = line.replace('#', '').split(' ')
            field = splitFields[0]
            value = " ".join(splitFields[1:])
            if field == 'sequence-header':
                [internalTag, delim, externalTag] = value.strip().partition(' ')
                self.seqMap[internalTag] = externalTag
            print(line.strip(), file=summaryWriter)
            continue

        if inHeader:
            # We are at the end of the header -- write the tool-specific headers
            for field in headers:
                print(("##%s %s" % field), file=summaryWriter)
            inHeader = False

        # Parse the line
        rec = Gff3Record.fromString(line)
        if rec.type == 'region':
            # Get the hits in this interval, add them to the gff record
            intervalHits = [h for h in hits
                            if rec.start <= h['pos'] <= rec.end
                            and rec.seqid == h['seqid']]

            cFwd = self.countModificationTypes(
                [h for h in intervalHits if h['strand'] == '+'])
            cRev = self.countModificationTypes(
                [h for h in intervalHits if h['strand'] == '-'])

            rec.modsfwd = ",".join(
                [str(cFwd[x]) for x in self.knownModificationEvents])
            rec.modsrev = ",".join(
                [str(cRev[x]) for x in self.knownModificationEvents])

            print(str(rec), file=summaryWriter)
    return 0
def test_sort_gff(self):
    gff_out = sort_gff(self.combined)
    with GffReader(gff_out) as f:
        start = [(rec.seqid, rec.start) for rec in f]
        assert start == self.SORTED_START
    rm_out(gff_out)
def setup(self):
    self.rawFile = open(data.getGff3())
    self.reader = GffReader(data.getGff3())
def main():
    headers = [
        ("source", "GenomicConsensus %s" % __VERSION__),
        ("pacbio-alignment-summary-version", "0.6"),
        ("source-commandline", " ".join(sys.argv)),
    ]

    desc = "Augment the alignment_summary.gff file with consensus and variants information."
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--variantsGff",
                        type=str,
                        help="Input variants.gff or variants.gff.gz filename",
                        required=True)
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        help="Output alignment_summary.gff filename")
    parser.add_argument("inputAlignmentSummaryGff",
                        type=str,
                        help="Input alignment_summary.gff filename")
    options = parser.parse_args()

    inputVariantsGff = GffReader(options.variantsGff)
    inputAlignmentSummaryGff = GffReader(options.inputAlignmentSummaryGff)

    summaries = {}
    for gffRecord in inputAlignmentSummaryGff:
        region = Region(gffRecord.seqid, gffRecord.start, gffRecord.end)
        summaries[region] = {"ins": 0,
                             "del": 0,
                             "sub": 0,
                             "cQv": (0, 0, 0)}
    inputAlignmentSummaryGff.close()

    counterNames = {"insertion": "ins",
                    "deletion": "del",
                    "substitution": "sub"}
    for variantGffRecord in inputVariantsGff:
        for region in summaries:
            summary = summaries[region]
            if (region.seqid == variantGffRecord.seqid and
                    region.start <= variantGffRecord.start <= region.end):
                counterName = counterNames[variantGffRecord.type]
                variantLength = max(len(variantGffRecord.reference),
                                    len(variantGffRecord.variantSeq))
                summary[counterName] += variantLength
                # TODO: base consensusQV on effective coverage
                summary["cQv"] = (20, 20, 20)

    inputAlignmentSummaryGff = open(options.inputAlignmentSummaryGff)
    outputAlignmentSummaryGff = open(options.output, "w")

    inHeader = True
    for line in inputAlignmentSummaryGff:
        line = line.rstrip()

        # Pass any metadata line straight through
        if line[0] == "#":
            print >> outputAlignmentSummaryGff, line.strip()
            continue

        if inHeader:
            # We are at the end of the header -- write the tool-specific headers
            for k, v in headers:
                print >> outputAlignmentSummaryGff, ("##%s %s" % (k, v))
            inHeader = False

        # Parse the line
        rec = Gff3Record.fromString(line)
        if rec.type == "region":
            summary = summaries[(rec.seqid, rec.start, rec.end)]
            if "cQv" in summary:
                cQvTuple = summary["cQv"]
                line += ";%s=%s" % ("cQv",
                                    ",".join(str(int(f)) for f in cQvTuple))
            for counterName in counterNames.values():
                if counterName in summary:
                    line += ";%s=%d" % (counterName, summary[counterName])
            print >> outputAlignmentSummaryGff, line
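# To illustrate the augmentation step above: a 'region' line gains a cQv
# attribute plus the ins/del/sub counters. The coordinates and counts here are
# made up; only the attribute syntax follows from the code.
#
#   before:  lambda_NEB3011 . region 1 5000 0.00 + . cov2=24.0,5.0;gaps=0,0
#   after:   lambda_NEB3011 . region 1 5000 0.00 + . cov2=24.0,5.0;gaps=0,0;cQv=20,20,20;ins=3;del=1;sub=0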
def test_sort_gff(self):
    gff_out = sort_gff(self.combined)
    with GffReader(gff_out) as f:
        start = [(rec.seqid, rec.start) for rec in f]
        self.assertEqual(start, self.sorted_start)
    rm_out(gff_out)
def onChunk(self, referenceWindow):
    (motifDicts, refInfo, motifInfo, modifFile, undetectedOnly, oldData,
     start, end) = referenceWindow

    print "Start = ", start, " End = ", end

    # Read in the modifications GFF if needed:
    if not undetectedOnly:
        modReader = GffReader(modifFile)
        modifDicts = [{"seqID": x.seqid, "type": x.type, "score": x.score,
                       "pos": x.start, "strand": x.strand,
                       "attributes": x.attributes}
                      for x in modReader]

    # To help find the correct reference for an entry in motifs.gff
    self.selectRef = [x.FullName for x in refInfo]

    # Loop through the rows of the motifs GFF file:
    self.pre = self.ipdModel.gbmModel.post
    self.post = self.ipdModel.gbmModel.pre

    collectResults = []
    modificationsLinecount = 0

    for d in motifDicts:

        if d["type"] != '.' and undetectedOnly:
            # Go on to the next row
            continue

        ref = refInfo[self.getReferenceIndex(d, oldData)]
        self.refId = ref.ID

        k = self.fillOutEasyInformation(d, motifInfo)
        if not "motif" in k.keys():
            # If no motif is listed in this row, go on to the next row
            continue

        if d["type"] != '.' and not undetectedOnly:
            # Search sorted list of template positions in modifications GFF
            # for a match:
            for y in modifDicts[modificationsLinecount:]:
                if y["pos"] == d["pos"] and y["strand"] == d["strand"] \
                        and y["seqID"] == d["seqID"]:
                    break
                modificationsLinecount += 1

            # Once the match is found, copy in the modified fraction estimate
            if modificationsLinecount <= len(modifDicts):
                u = y["attributes"]
                if FRAC in u:
                    k[FRAC] = float(u[FRAC])
                    k[FRAClow] = float(u[FRAClow])
                    k[FRACup] = float(u[FRACup])

        if d["type"] == '.':
            # See update to ResultsWriter: 'nMd' is 'not modified'
            k['modification'] = 'nMd'

        # Figure out modification type:
        if oldData:
            self.modificationType = self.oldDataModificationType(motifInfo)
        else:
            self.modificationType = motifInfo[k['motif']]

        # Select a window around the current position to use for estimation
        stop = k["tpl"] + self.post
        start = min(max(1, (k["tpl"] - self.pre)), stop)

        # Trim end coordinate to length of current template
        # end = min(end, self.ipdModel.refLength(self.refId))

        # Try to estimate the modified fraction:
        if self.modificationType == 'modified_base':
            # In this case, we'll need the mean Ipd function:
            self.meanIpdFunc = self.ipdModel.predictIpdFunc(self.refId)

        self.strand = k['strand']
        perSiteResults = self._summarizeReferenceRegion((start, stop))

        if self.modificationType == 'modified_base':
            k[FRAC] = perSiteResults[self.post - 1][FRAC]
            k[FRAClow] = perSiteResults[self.post - 1][FRAClow]
            k[FRACup] = perSiteResults[self.post - 1][FRACup]
        else:
            mods = self._decodePositiveControl(perSiteResults, (start, stop))
            k[FRAC] = mods[0]
            k[FRAClow] = mods[1]
            k[FRACup] = mods[2]

        collectResults.append(k)

    return collectResults