def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its
    coverage object.

    Records whose seqid does not match the id of any entry in ``contigs``
    are skipped and reported at info level.

    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference
    :returns: (dict) contig id -> ContigCoverage
    """
    # Index the names by id once, so that each record costs a single dict
    # lookup rather than a scan of ``contigs``. setdefault keeps the first
    # contig seen for a duplicated id, like a front-to-back search would.
    names_by_id = {}
    for contig in contigs:
        names_by_id.setdefault(contig.id, contig.name)

    cov_map = {}
    reader = GffReader(alignment_summ_gff)
    try:
        for rec in reader:
            seqid = rec.seqid
            if seqid not in names_by_id:
                log.info("Skipping seqid '{i}'.".format(i=seqid))
                continue
            if seqid not in cov_map:
                cov_map[seqid] = ContigCoverage(seqid, names_by_id[seqid])
            cov_map[seqid].add_data(rec)
    finally:
        # Release the reader even when a record cannot be read or added.
        reader.close()
    return cov_map
def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its
    coverage object.

    GFF records whose seqid is not the id of one of ``contigs`` are ignored;
    each such record is logged at info level.

    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference
    :returns: (dict) contig id -> ContigCoverage
    """
    # One pass over ``contigs`` builds the id -> name lookup used for both
    # the membership test and the name resolution below. The first contig
    # wins when an id is repeated.
    contig_names = {}
    for c in contigs:
        if c.id not in contig_names:
            contig_names[c.id] = c.name

    cov_map = {}
    reader = GffReader(alignment_summ_gff)
    try:
        for rec in reader:
            if rec.seqid not in contig_names:
                log.info(
                    "Unable to find gff '{i}' in alignment contig ids.".format(i=rec.seqid))
                continue
            contig_cov = cov_map.get(rec.seqid)
            if contig_cov is None:
                contig_cov = ContigCoverage(rec.seqid, contig_names[rec.seqid])
                cov_map[rec.seqid] = contig_cov
            contig_cov.add_data(rec)
    finally:
        # Always close the reader, including when processing a record fails.
        reader.close()
    return cov_map
def _get_contig_coverage(alignment_summ_gff, contigs):
    """
    Modifies the passed contigs object to include coverage information.

    Every GFF record whose seqid is a key of ``contigs`` is handed to that
    entry's ``add_coverage_data``; nothing is returned.

    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (dict) contig id -> ContigInfo object
    """
    reader = GffReader(alignment_summ_gff)
    try:
        for rec in reader:
            # Some contigs don't have any coverage, but make it into the gff
            # file
            if rec.seqid in contigs:
                contigs[rec.seqid].add_coverage_data(rec)
    finally:
        # Close the reader even if a record fails part-way through.
        reader.close()
def _extract_alignment_summ_data(aln_summ_gff, contigs):
    """
    Collects per-contig totals and variant data from alignment_summary.gff.

    Only records whose seqid (first whitespace-separated token) is the id of
    one of ``contigs`` are used; all others are ignored.

    :param aln_summ_gff: (str) path to alignment_summary.gff
    :param contigs: (list) top contigs from reference
    :returns: 2 dictionaries containing data extracted from
              alignment_summary.gff: ``ref_data`` (seqid -> 4-item list
              indexed by the module's LENGTH / GAPS / COV constants) and
              ``var_map`` (seqid -> ContigVariants)
    """
    # Resolve names through a dict built once instead of scanning ``contigs``
    # for every record; the first contig wins for a repeated id.
    names_by_id = {}
    for contig in contigs:
        names_by_id.setdefault(contig.id, contig.name)

    ref_data = {}
    var_map = {}
    log.info("Reading GFF data from {f}".format(f=aln_summ_gff))
    reader = GffReader(aln_summ_gff)
    try:
        for rec in reader:
            seqid = rec.seqid.split()[0]
            if seqid not in names_by_id:
                continue

            # first data set
            totals = ref_data.setdefault(seqid, [0, 0, 0, 0])
            totals[LENGTH] = max(rec.end, totals[LENGTH])
            # "gaps" is a "<count>,<length>" pair; only the length is summed.
            _num_gaps, len_gaps = rec.attributes["gaps"].split(",")
            totals[GAPS] += int(len_gaps)
            # Weight the first "cov2" value by the record's span so that COV
            # accumulates a length-weighted sum.
            span = rec.end - rec.start + 1
            totals[COV] += float(rec.attributes["cov2"].split(",")[0]) * span

            # second data set
            if seqid not in var_map:
                var_map[seqid] = ContigVariants(seqid, names_by_id[seqid])
            var_map[seqid].add_data(rec)
    finally:
        # Release the reader even when a record is malformed.
        reader.close()
    return ref_data, var_map
def _append_variants_gff_data(ref_data, variants_gff):
    """
    Adds data from variants gff to the ref_data dict

    For each variants record, the record's length (end - start + 1) is added
    to the ERR slot of the matching ``ref_data`` entry. ``ref_data`` is
    modified in place; nothing is returned.

    :param ref_data: (dict) dict of data pulled from alignment_summary.gff
    :param variants_gff: (str) path to variants_gff
    :type variants_gff: str
    """
    reader = GffReader(variants_gff)
    try:
        for record in reader:
            err_len = record.end - record.start + 1
            seqid = record.seqid.split()[0]
            if seqid in ref_data:
                ref_data[seqid][ERR] += err_len
            else:
                # the variants might not be present in the top 25 contigs,
                # so we can just raise a warning in the log.
                msg = "Unable to find {r} in {f}".format(r=seqid, f=variants_gff)
                # warning() rather than the deprecated warn() alias.
                log.warning(msg)
    finally:
        # Close the reader even if a record cannot be processed.
        reader.close()
def _append_variants_gff_data(ref_data, variants_gff):
    """
    Adds data from variants gff to the ref_data dict

    The length of every variants record (end - start + 1) is accumulated
    into the ERR slot of the ``ref_data`` entry for the record's seqid.
    Records for a seqid missing from ``ref_data`` only produce a log
    warning. ``ref_data`` is updated in place.

    :param ref_data: (dict) dict of data pulled from alignment_summary.gff
    :param variants_gff: (str) path to variants_gff
    :type variants_gff: str
    """
    reader = GffReader(variants_gff)
    try:
        for record in reader:
            err_len = record.end - record.start + 1
            seqid = record.seqid.split()[0]
            if seqid not in ref_data:
                # the variants might not be present in the top 25 contigs,
                # so we can just raise a warning in the log.
                msg = "Unable to find {r} in {f}".format(
                    r=seqid, f=variants_gff)
                # Logger.warn is a deprecated alias; use warning().
                log.warning(msg)
                continue
            ref_data[seqid][ERR] += err_len
    finally:
        # The reader must be released whether or not the loop completes.
        reader.close()
def main():
    """
    Augment an alignment_summary.gff file with consensus and variants
    information.

    Command-line entry point. Options are read from ``sys.argv``:

    * ``--variantsGff``: input variants.gff or variants.gff.gz (required)
    * ``--output`` / ``-o``: output alignment_summary.gff (required)
    * ``inputAlignmentSummaryGff``: input alignment_summary.gff

    The input summary is read once to create a zeroed tally per region, the
    variants are tallied into the regions containing their start position,
    and the summary is then copied to the output with ``cQv``, ``ins``,
    ``del`` and ``sub`` attributes appended to each ``region`` record. The
    tool-specific ``##`` headers are inserted after the input's own header.
    """
    headers = [
        ("source", "GenomicConsensus %s" % __VERSION__),
        ("pacbio-alignment-summary-version", "0.6"),
        ("source-commandline", " ".join(sys.argv)),
    ]

    desc = "Augment the alignment_summary.gff file with consensus and variants information."
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--variantsGff",
                        type=str,
                        help="Input variants.gff or variants.gff.gz filename",
                        required=True)
    # The output path is passed to open() unconditionally below, so demand it
    # up front rather than failing later with a TypeError on None.
    parser.add_argument("--output", "-o",
                        type=str,
                        help="Output alignment_summary.gff filename",
                        required=True)
    parser.add_argument("inputAlignmentSummaryGff",
                        type=str,
                        help="Input alignment_summary.gff filename")
    options = parser.parse_args()

    # Pass 1: one zeroed summary per record of the alignment summary.
    summaries = {}
    summary_reader = GffReader(options.inputAlignmentSummaryGff)
    try:
        for gffRecord in summary_reader:
            region = Region(gffRecord.seqid, gffRecord.start, gffRecord.end)
            summaries[region] = {"ins": 0,
                                 "del": 0,
                                 "sub": 0,
                                 "cQv": (0, 0, 0)}
    finally:
        summary_reader.close()

    counterNames = {
        "insertion": "ins",
        "deletion": "del",
        "substitution": "sub"
    }

    # Pass 2: add each variant's length to the matching counter of every
    # region on the same sequence that contains the variant's start.
    variants_reader = GffReader(options.variantsGff)
    try:
        for variantGffRecord in variants_reader:
            for region in summaries:
                summary = summaries[region]
                if (region.seqid == variantGffRecord.seqid and
                        region.start <= variantGffRecord.start <= region.end):
                    counterName = counterNames[variantGffRecord.type]
                    variantLength = max(len(variantGffRecord.reference),
                                        len(variantGffRecord.variantSeq))
                    summary[counterName] += variantLength
                # TODO: base consensusQV on effective coverage
                # NOTE(review): placed at region-loop level, so every region
                # is stamped once any variant is seen -- confirm intended.
                summary["cQv"] = (20, 20, 20)
    finally:
        # The variants reader was previously never closed.
        variants_reader.close()

    # Pass 3: copy the summary to the output, annotating region records.
    # Both handles are closed on exit, including on error.
    with open(options.inputAlignmentSummaryGff) as gff_in:
        with open(options.output, "w") as gff_out:
            inHeader = True
            for line in gff_in:
                line = line.rstrip()

                # Pass any metadata line straight through. Blank lines are
                # copied the same way instead of being indexed (line[0]
                # raises IndexError on an empty string).
                if not line or line.startswith("#"):
                    gff_out.write(line + "\n")
                    continue

                if inHeader:
                    # We are at the end of the header -- write the
                    # tool-specific headers
                    for k, v in headers:
                        gff_out.write("##%s %s\n" % (k, v))
                    inHeader = False

                # Parse the line
                rec = Gff3Record.fromString(line)

                # NOTE(review): only "region" records are written to the
                # output -- confirm other record types should be dropped.
                if rec.type == "region":
                    # Look up with the same key type used to build the dict.
                    summary = summaries[Region(rec.seqid, rec.start, rec.end)]
                    if "cQv" in summary:
                        cQvTuple = summary["cQv"]
                        line += ";%s=%s" % ("cQv", ",".join(
                            str(int(f)) for f in cQvTuple))
                    for counterName in counterNames.values():
                        if counterName in summary:
                            line += ";%s=%d" % (counterName,
                                                summary[counterName])
                    gff_out.write(line + "\n")