def cli(ctx, bam, reference, genes_file, output):
    # Build an amino acid census from the reads in `bam` and emit its
    # coverage report: written to `output` (which is then closed) when
    # `output` is truthy, otherwise echoed through click.
    # NOTE(review): documented with comments rather than a docstring on the
    # assumption that this is a click command, where a docstring would be
    # shown as help text -- confirm against the decorators.
    references = parse_references_from_fasta(reference)

    # One mapped-read collection per parsed reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references
    ]

    # The gene parser is given the name of the first reference.
    genes = parse_genes_file(genes_file, references[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[name]["frame"] for name in genes}

    census = AACensus(reference, read_collections, genes, frames)
    report = census.coverage(frames)

    if not output:
        click.echo(report)
        return

    output.write(report)
    output.close()
def cli(ctx, bam, reference, bed4_file, output):
    """This script builds an amino acid census and returns its coverage.
    The BAM alignment file corresponds to a pileup of sequences aligned to the
    REFERENCE. A BAM index file (.bai) must also be present and, except for
    the extension, have the same name as the BAM file. The REFERENCE must be
    in FASTA format. The BED4_FILE must be a BED file with at least 4 columns
    and specify the gene locations within the REFERENCE. The output is in CSV
    format."""
    # The docstring wording above is kept as-is because it is presumably
    # surfaced as command help text.
    references = parse_references_from_fasta(reference)

    # One mapped-read collection per parsed reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references
    ]

    # The BED4 parser is given the name of the first reference.
    genes = parse_BED4_file(bed4_file, references[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[name]["frame"] for name in genes}

    census = AACensus(reference, read_collections, genes, frames)
    report = census.coverage(frames)

    if not output:
        click.echo(report)
        return

    output.write(report)
    output.close()
class TestMappedRead:
    """Checks AACensus coverage output against a stored CSV."""

    @classmethod
    def setup_class(self):
        """Build the shared AACensus fixture from the test data files.

        This is a classmethod, so `self` is the class itself and the
        `frames` / `aa_census` attributes are stored on the class.
        """
        fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
        bam_path = TEST_PATH + "/data/align.bam"
        bed_path = TEST_PATH + "/data/hxb2_pol.bed"

        references = parse_references_from_fasta(fasta_path)

        # One mapped-read collection per parsed reference.
        read_collections = [
            parse_mapped_reads_from_bam(ref, bam_path) for ref in references
        ]

        genes = parse_BED4_file(bed_path, references[0].name)

        # Distinct reading frames used by the parsed genes.
        self.frames = {genes[name]["frame"] for name in genes}

        self.aa_census = AACensus(fasta_path, read_collections, genes,
                                  self.frames)

    def test_coverage(self):
        """The census coverage text must equal the stored reference file."""
        with open(VALID_COVERAGE_CSV, "r") as expected_file:
            expected = expected_file.read()

        assert self.aa_census.coverage(self.frames) == expected
def setup_class(self):
    """Build the AACensus fixture and store it, with its frames, on `self`."""
    fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
    bam_path = TEST_PATH + "/data/align.bam"
    bed_path = TEST_PATH + "/data/hxb2_pol.bed"

    references = parse_references_from_fasta(fasta_path)

    # One mapped-read collection per parsed reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam_path) for ref in references
    ]

    genes = parse_BED4_file(bed_path, references[0].name)

    # Distinct reading frames used by the parsed genes.
    self.frames = {genes[name]["frame"] for name in genes}

    self.aa_census = AACensus(fasta_path, read_collections, genes,
                              self.frames)
def test_from_aacensus(self):
    """Build codon variants from an AACensus and sanity-check them.

    Reads `self.references` and `self.reference`, which are expected to be
    set up elsewhere on the test class.
    """
    bam_path = TEST_PATH + "/data/align.bam"
    bed_path = TEST_PATH + "/data/hxb2_pol.bed"
    error_rate = 0.0038

    # One mapped-read collection per reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam_path) for ref in self.references
    ]

    nt_variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, self.references, *read_collections)

    # Same three filters, applied in the same order.
    for filter_id, expression in (('q30', 'QUAL<30'),
                                  ('ac5', 'AC<5'),
                                  ('dp100', 'DP<100')):
        nt_variants.filter(filter_id, expression, True)

    # Mask the unconfident differences.
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    genes = parse_BED4_file(bed_path, self.references[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[name]['frame'] for name in genes}

    census = AACensus(self.reference, read_collections, genes, frames)
    codon_variants = CodonVariantCollection.from_aacensus(census)

    ref_seq = self.references[0].seq

    for gene in codon_variants.variants:
        assert gene in genes
        gene_info = genes[gene]
        variants_by_pos = codon_variants.variants[gene]

        for pos in variants_by_pos:
            for frame in frames:
                nt_pos = pos / 3 - frame
                # NOTE(review): joined with `or`, this only fails when
                # nt_pos is both below 'start' and above 'end'; `and` may
                # have been intended -- confirm before tightening.
                assert (nt_pos >= gene_info['start']
                        or nt_pos <= gene_info['end'])

            # No reported codon may equal the reference codon at `pos`.
            for codon in variants_by_pos[pos]:
                assert codon != ref_seq[pos:pos + 3].lower()
def cli(ctx, bam, reference, variants, bed4_file, min_freq, mutation_db,
        reporting_threshold, output):
    # Produce the drug-resistance mutation report for the reads in `bam`:
    # written to `output` (which is then closed) when `output` is truthy,
    # otherwise echoed through click.
    # NOTE(review): documented with comments rather than a docstring on the
    # assumption that this is a click command, where a docstring would be
    # shown as help text -- confirm against the decorators.
    references = parse_references_from_fasta(reference)

    # One mapped-read collection per parsed reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references
    ]

    nt_variants = parse_nt_variants_from_vcf(variants, references)

    # Mask the unconfident differences.
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    # The BED4 parser is given the name of the first reference.
    genes = parse_BED4_file(bed4_file, references[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[name]['frame'] for name in genes}

    census = AACensus(reference, read_collections, genes, frames)
    aa_vars = AAVariantCollection.from_aacensus(census)

    # Filter for mutant frequency; the filter id embeds the threshold.
    threshold_text = str(min_freq)
    aa_vars.filter('mf' + threshold_text, 'freq<' + threshold_text, True)

    # Build the mutation database from the path/handle passed in.
    dr_database = MutationDB(mutation_db, genes)

    report = aa_vars.report_dr_mutations(dr_database, reporting_threshold)

    if not output:
        click.echo(report)
        return

    output.write(report)
    output.close()
def aavar(bam, reference, variants, genes_file, min_freq, mutation_db,
          output):
    # Call amino acid variants for the reads in `bam` and emit them via
    # `to_hmcf_file`: written to `output` (then closed) when `output` is
    # truthy, otherwise echoed through click.
    #
    # Changes from the previous revision:
    #   * `min_freq` is now honoured by the frequency filter (it used to be
    #     accepted but ignored in favour of a hard-coded 0.01);
    #   * `output` is closed after writing, even if the write fails.
    # NOTE(review): documented with comments rather than a docstring on the
    # assumption that this is a click command, where a docstring would be
    # shown as help text -- confirm against the decorators.
    rs = parse_references_from_fasta(reference)

    # One mapped-read collection per parsed reference.
    mapped_read_collection_arr = [
        parse_mapped_reads_from_bam(r, bam) for r in rs
    ]

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences.
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # The gene parser is given the name of the first reference.
    genes = parse_genes_file(genes_file, rs[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[gene]['frame'] for gene in genes}

    aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                         frames)
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency. Fall back to the previously hard-coded
    # threshold when the caller supplies no value.
    if min_freq is None:
        min_freq = 0.01
    aa_vars.filter('mf' + str(min_freq), 'freq<' + str(min_freq), True)

    # Build the mutation database and update the collection, if one is given.
    if mutation_db is not None:
        aa_vars.apply_mutation_db(MutationDB(mutation_db, genes))

    hmcf_text = aa_vars.to_hmcf_file(CONFIDENT)

    if output:
        try:
            output.write(hmcf_text)
        finally:
            output.close()
    else:
        click.echo(hmcf_text)
def codonvar(bam, reference, offset, bed4_file, variants, error_rate,
             output):
    # Compute codon variants for the reads in `bam` and emit them as CSV:
    # written to `output` (which is then closed) when `output` is truthy,
    # otherwise echoed through click.
    # NOTE(review): documented with comments rather than a docstring on the
    # assumption that this is a click command, where a docstring would be
    # shown as help text -- confirm against the decorators.
    references = parse_references_from_fasta(reference)

    # One mapped-read collection per parsed reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam) for ref in references
    ]

    if variants:
        # Nucleotide variants were supplied as a VCF.
        variants_obj = parse_nt_variants_from_vcf(variants, references)
    else:
        # None supplied: call them from the reads and apply the filters.
        variants_obj = NTVariantCollection.from_mapped_read_collections(
            error_rate, references, *read_collections)
        variants_obj.filter('q30', 'QUAL<30', True)
        variants_obj.filter('ac5', 'AC<5', True)
        variants_obj.filter('dp100', 'DP<100', True)

    # Mask the unconfident differences.
    for collection in read_collections:
        collection.mask_unconfident_differences(variants_obj)

    # The BED4 parser is given the name of the first reference.
    genes = parse_BED4_file(bed4_file, references[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[name]['frame'] for name in genes}

    census = AACensus(reference, read_collections, genes, frames)
    codon_variants = CodonVariantCollection.from_aacensus(census)

    csv_text = codon_variants.to_csv_file(offset)

    if not output:
        click.echo(csv_text)
        return

    output.write(csv_text)
    output.close()
def setup(self):
    """Prepare `self.aa_collection` and `self.mutation_db` for the tests."""
    fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
    bam_path = TEST_PATH + "/data/align.bam"
    bed_path = TEST_PATH + "/data/hxb2_pol.bed"
    mutation_db_path = TEST_PATH + "/data/mutation_db.tsv"
    # NOTE(review): assigned but not referenced anywhere below.
    min_freq = 0.01

    references = parse_references_from_fasta(fasta_path)

    # One mapped-read collection per parsed reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam_path) for ref in references
    ]

    nt_variants = parse_nt_variants_from_vcf(VARIANTS_FILE, references)

    # Mask the unconfident differences.
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    # The gene parser is given the name of the first reference.
    genes = parse_genes_file(bed_path, references[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[name]['frame'] for name in genes}

    census = AACensus(fasta_path, read_collections, genes, frames)

    # Find the AA mutations.
    self.aa_collection = AAVariantCollection.from_aacensus(census)

    # Build the mutation database.
    self.mutation_db = MutationDB(mutation_db_path, genes)
def setup(self):
    """Prepare `self.codon_variants` from the bundled test data."""
    bam_path = TEST_PATH + "/data/align.bam"
    fasta_path = TEST_PATH + "/data/hxb2_pol.fas"
    bed_path = TEST_PATH + "/data/hxb2_pol.bed"
    error_rate = 0.0038

    references = parse_references_from_fasta(fasta_path)

    # One mapped-read collection per parsed reference.
    read_collections = [
        parse_mapped_reads_from_bam(ref, bam_path) for ref in references
    ]

    nt_variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, references, *read_collections)

    # Same three filters, applied in the same order.
    for filter_id, expression in (('q30', 'QUAL<30'),
                                  ('ac5', 'AC<5'),
                                  ('dp100', 'DP<100')):
        nt_variants.filter(filter_id, expression, True)

    # Mask the unconfident differences.
    for collection in read_collections:
        collection.mask_unconfident_differences(nt_variants)

    # The gene parser is given the name of the first reference.
    genes = parse_genes_file(bed_path, references[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[name]['frame'] for name in genes}

    census = AACensus(fasta_path, read_collections, genes, frames)
    self.codon_variants = CodonVariantCollection.from_aacensus(census)
def aavar(bam, reference, bed4_file, variants, mutation_db, min_freq,
          error_rate, output):
    # Call amino acid variants for the reads in `bam` and write them as AAVF
    # records to `output` when it is truthy, otherwise to sys.stdout.
    #
    # Changes from the previous revision:
    #   * `output.close` was missing its call parentheses, so the output was
    #     never explicitly closed; it is now closed for real;
    #   * `min_freq` is now honoured by the frequency filter (it used to be
    #     accepted but ignored in favour of a hard-coded 0.01).
    # NOTE(review): documented with comments rather than a docstring on the
    # assumption that this is a click command, where a docstring would be
    # shown as help text -- confirm against the decorators.
    rs = parse_references_from_fasta(reference)

    # One mapped-read collection per parsed reference.
    mapped_read_collection_arr = [
        parse_mapped_reads_from_bam(r, bam) for r in rs
    ]

    if variants:
        # Nucleotide variants were supplied as a VCF.
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        # None supplied: call them from the reads and apply the filters.
        variants_obj = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants_obj.filter('q30', 'QUAL<30', True)
        variants_obj.filter('ac5', 'AC<5', True)
        variants_obj.filter('dp100', 'DP<100', True)

    # Mask the unconfident differences.
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # The BED4 parser is given the name of the first reference.
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Distinct reading frames used by the parsed genes.
    frames = {genes[gene]['frame'] for gene in genes}

    aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                         frames)
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency. Fall back to the previously hard-coded
    # threshold when the caller supplies no value.
    if min_freq is None:
        min_freq = 0.01
    aa_vars.filter('mf' + str(min_freq), 'freq<' + str(min_freq), True)

    # Build the mutation database and update the collection, if one is given.
    if mutation_db is not None:
        aa_vars.apply_mutation_db(MutationDB(mutation_db, genes))

    aavf_obj = aa_vars.to_aavf_obj("aavar", os.path.basename(reference),
                                   CONFIDENT)
    records = list(aavf_obj)

    writer = parser.Writer(output if output else sys.stdout, aavf_obj)
    for record in records:
        writer.write_record(record)

    # Close the writer first (as the old code effectively did), then the
    # output itself.
    writer.close()
    if output:
        output.close()
def analyze_reads(self, fasta_id, variant_filters, reporting_threshold,
                  generate_consensus):
    """Run the read-analysis pipeline and write its report files.

    Steps, in order: map the reads (`self.generate_bam`), load the mapped
    reads for every entry of `self.references`, optionally write
    ``consensus.fasta``, call and filter nucleotide variants
    (``hydra.vcf``), mask unconfident differences, build the amino acid
    census (``coverage_file.csv``), call amino acid variants
    (``mutation_report.aavf``) and, when a mutation database is configured,
    write ``dr_report.csv``. Finally `self.output_stats` is called. All
    files are created under `self.output_dir`.

    Changes from the previous revision:
      * the drug-resistance report is only written when
        `self.mutation_db` is set; before, the local `mutation_db` was
        unbound in that case and the call raised UnboundLocalError;
      * the try/except around `generate_bam` that merely re-raised is gone;
      * report files are closed even when a step raises;
      * the consensus is only computed when it is going to be written.

    Args:
        fasta_id: Passed to `self.generate_bam`; also used in the consensus
            FASTA header.
        variant_filters: Mapping indexed by ERROR_RATE, MIN_VARIANT_QUAL,
            MIN_AC, MIN_DP and MIN_FREQ.
        reporting_threshold: Used in the consensus FASTA header and passed
            to `report_dr_mutations`.
        generate_consensus: When truthy, ``consensus.fasta`` is written.
    """
    # Map reads against the reference.
    if not self.quiet:
        print("# Mapping reads...")

    bam = self.generate_bam(fasta_id)

    if not self.quiet:
        print("# Loading read mappings...")

    # cmd_consensus
    cons_seq_file = None
    if generate_consensus:
        cons_seq_file = open("%s/consensus.fasta" % self.output_dir, "w+")

    mapped_read_collection_arr = []
    try:
        for r in self.references:
            mrc = parse_mapped_reads_from_bam(r, bam)
            mapped_read_collection_arr.append(mrc)

            if cons_seq_file is not None:
                consensus_seq = mrc.to_consensus(self.consensus_pct)
                if len(consensus_seq) > 0:
                    # NOTE(review): no trailing newline is written, so
                    # records for several references would run together --
                    # confirm whether that is intended.
                    cons_seq_file.write('>{0}_{1}_{2}\n{3}'.format(
                        fasta_id, reporting_threshold, r.name,
                        consensus_seq))
    finally:
        if cons_seq_file is not None:
            cons_seq_file.close()

    # cmd_callntvar
    if not self.quiet:
        print("# Identifying variants...")

    variants = NTVariantCollection.from_mapped_read_collections(
        variant_filters[ERROR_RATE], self.references,
        *mapped_read_collection_arr)

    variants.filter('q%s' % variant_filters[MIN_VARIANT_QUAL],
                    'QUAL<%s' % variant_filters[MIN_VARIANT_QUAL], True)
    variants.filter('ac%s' % variant_filters[MIN_AC],
                    'AC<%s' % variant_filters[MIN_AC], True)
    variants.filter('dp%s' % variant_filters[MIN_DP],
                    'DP<%s' % variant_filters[MIN_DP], True)

    with open("%s/hydra.vcf" % self.output_dir, "w+") as vcf_file:
        vcf_file.write(variants.to_vcf_file())

    # cmd_aa_census
    if not self.quiet:
        print("# Masking filtered variants...")

    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants)

    if not self.quiet:
        print("# Building amino acid census...")

    # Distinct reading frames used by the configured genes.
    frames = {self.genes[gene]['frame'] for gene in self.genes}

    aa_census = AACensus(self.reference, mapped_read_collection_arr,
                         self.genes, frames)

    with open("%s/coverage_file.csv" % self.output_dir,
              "w+") as coverage_file:
        coverage_file.write(aa_census.coverage(frames))

    # cmd_aavariants
    if not self.quiet:
        print("# Finding amino acid mutations...")

    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency.
    aa_vars.filter('mf%s' % variant_filters[MIN_FREQ],
                   'freq<%s' % variant_filters[MIN_FREQ], True)

    # Build the mutation database and update the collection. The local stays
    # None when no database is configured so the DR step can test for it.
    mutation_db = None
    if self.mutation_db is not None:
        mutation_db = MutationDB(self.mutation_db, self.genes)
        aa_vars.apply_mutation_db(mutation_db)

    aavf_obj = aa_vars.to_aavf_obj("hydra",
                                   os.path.basename(self.reference),
                                   CONFIDENT)
    records = list(aavf_obj)

    with open("%s/mutation_report.aavf" % self.output_dir,
              "w+") as mut_report:
        writer = parser.Writer(mut_report, aavf_obj)
        for record in records:
            writer.write_record(record)
    # Same order as before: the file is closed first, then the writer.
    writer.close()

    # cmd_drmutations -- needs a mutation database, so skip it otherwise.
    if mutation_db is not None:
        if not self.quiet:
            print("# Writing drug resistant mutation report...")

        with open("%s/dr_report.csv" % self.output_dir, "w+") as dr_report:
            dr_report.write(
                aa_vars.report_dr_mutations(mutation_db,
                                            reporting_threshold))

    self.output_stats(mapped_read_collection_arr)
def analyze_reads(self, fasta_id, filters, reporting_threshold,
                  generate_consensus):
    """Run the read-analysis pipeline and write its report files.

    Steps, in order: map the reads (`self.generate_bam`), load the mapped
    reads for every entry of `self.references`, optionally write
    ``consensus.fasta``, call and filter nucleotide variants
    (``hydra.vcf``), mask unconfident differences, build the amino acid
    census (``coverage_file.csv``), call amino acid variants
    (``mutation_report.hmcf``) and, when a mutation database is configured,
    write ``dr_report.csv``. Finally `self.output_stats` is called. All
    files are created under `self.output_dir`.

    Changes from the previous revision:
      * the drug-resistance report is only written when
        `self.mutation_db` is set; before, the local `mutation_db` was
        unbound in that case and the call raised UnboundLocalError;
      * report files are closed even when a step raises.

    Args:
        fasta_id: Passed to `self.generate_bam`; also used in the consensus
            FASTA header.
        filters: Mapping with the keys "error_rate", "min_qual", "min_ac",
            "min_dp" and "min_freq".
        reporting_threshold: Used in the consensus FASTA header and passed
            to `report_dr_mutations`.
        generate_consensus: When truthy, ``consensus.fasta`` is written.
    """
    # Map reads against the reference.
    if not self.quiet:
        print("# Mapping reads...")

    bam = self.generate_bam(fasta_id)

    if not self.quiet:
        print("# Loading read mappings...")

    # cmd_consensus
    cons_seq_file = None
    if generate_consensus:
        cons_seq_file = open("%s/consensus.fasta" % self.output_dir, "w+")

    mapped_read_collection_arr = []
    try:
        for r in self.references:
            mrc = parse_mapped_reads_from_bam(r, bam)
            mapped_read_collection_arr.append(mrc)

            if cons_seq_file is not None:
                # NOTE(review): no trailing newline is written, so records
                # for several references would run together -- confirm
                # whether that is intended.
                cons_seq_file.write('>{0}_{1}_{2}\n{3}'.format(
                    fasta_id, reporting_threshold, r.name,
                    mrc.to_consensus(self.consensus_pct)))
    finally:
        if cons_seq_file is not None:
            cons_seq_file.close()

    # cmd_callntvar
    if not self.quiet:
        print("# Identifying variants...")

    variants = NTVariantCollection.from_mapped_read_collections(
        filters["error_rate"], self.references,
        *mapped_read_collection_arr)

    variants.filter('q%s' % filters["min_qual"],
                    'QUAL<%s' % filters["min_qual"], True)
    variants.filter('ac%s' % filters["min_ac"],
                    'AC<%s' % filters["min_ac"], True)
    variants.filter('dp%s' % filters["min_dp"],
                    'DP<%s' % filters["min_dp"], True)

    with open("%s/hydra.vcf" % self.output_dir, "w+") as vcf_file:
        vcf_file.write(variants.to_vcf_file())

    # cmd_aa_census
    if not self.quiet:
        print("# Masking filtered variants...")

    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants)

    if not self.quiet:
        print("# Building amino acid census...")

    # Distinct reading frames used by the configured genes.
    frames = {self.genes[gene]['frame'] for gene in self.genes}

    aa_census = AACensus(self.reference, mapped_read_collection_arr,
                         self.genes, frames)

    with open("%s/coverage_file.csv" % self.output_dir,
              "w+") as coverage_file:
        coverage_file.write(aa_census.coverage(frames))

    # cmd_aavariants
    if not self.quiet:
        print("# Finding amino acid mutations...")

    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency.
    aa_vars.filter('mf%s' % filters['min_freq'],
                   'freq<%s' % filters['min_freq'], True)

    # Build the mutation database and update the collection. The local stays
    # None when no database is configured so the DR step can test for it.
    mutation_db = None
    if self.mutation_db is not None:
        mutation_db = MutationDB(self.mutation_db, self.genes)
        aa_vars.apply_mutation_db(mutation_db)

    with open("%s/mutation_report.hmcf" % self.output_dir,
              "w+") as mut_report:
        mut_report.write(aa_vars.to_hmcf_file(CONFIDENT))

    # cmd_drmutations -- needs a mutation database, so skip it otherwise.
    if mutation_db is not None:
        if not self.quiet:
            print("# Writing drug resistant mutation report...")

        with open("%s/dr_report.csv" % self.output_dir, "w+") as dr_report:
            dr_report.write(
                aa_vars.report_dr_mutations(mutation_db,
                                            reporting_threshold))

    self.output_stats(mapped_read_collection_arr)