def __init__(self, id, output_dir, reads, reference, genes_file, mutation_db,
             quiet, consensus_pct):
    self.id = id
    self.output_dir = output_dir
    self.reads = reads
    self.reference = reference
    self.mutation_db = mutation_db
    self.genes_file = genes_file
    self.quiet = quiet
    self.consensus_pct = consensus_pct

    self.filtered = {}
    self.filtered["status"] = 0
    self.filtered["length"] = 0
    self.filtered["score"] = 0
    self.filtered["ns"] = 0

    self.input_size = 0
    self.determine_input_size()

    self.references = parse_references_from_fasta(self.reference)
    self.genes = parse_genes_file(genes_file, self.references[0].name)

    self.filtered_reads = "%s/filtered.fastq" % output_dir

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
def cli(ctx, bam, reference, genes_file, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]["frame"])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    if output:
        output.write(aa_census.coverage(frames))
        output.close()
    else:
        click.echo(aa_census.coverage(frames))
def setup_class(self):
    self.references = parse_references_from_fasta('tests/data/ref1.fasta')
    self.variant_collection = NTVariantCollection(self.references)

    self.variant_collection.variants['ref1']['3']['t'] = NTVariant(
        chrom='ref1',
        pos=3,
        ref='c',
        alt='t',
        qual=30,
        info={
            'DP': 400,
            'AC': 12,
            'AF': 0.03
        })

    self.variant_collection.variants['ref1']['10']['a'] = NTVariant(
        chrom='ref1',
        pos=10,
        ref='a',
        alt='t',
        qual=23,
        info={
            'DP': 200,
            'AC': 7,
            'AF': 0.035
        })
def cli(ctx, bam, reference, bed4_file, output):
    """This script builds an amino acid census and returns its coverage.

    The BAM alignment file corresponds to a pileup of sequences aligned to
    the REFERENCE. A BAM index file (.bai) must also be present and, except
    for the extension, have the same name as the BAM file.

    The REFERENCE must be in FASTA format.

    The BED4_FILE must be a BED file with at least 4 columns and specify the
    gene locations within the REFERENCE.

    The output is in CSV format."""
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]["frame"])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    if output:
        output.write(aa_census.coverage(frames))
        output.close()
    else:
        click.echo(aa_census.coverage(frames))
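# Illustrative sketch only (not quasitools' actual parse_BED4_file): the
# "frame" recorded for each gene is assumed here to be the gene's 0-based
# start coordinate modulo 3, which is what the downstream AACensus needs to
# translate codons in the correct register. The record layout and function
# name below are hypothetical stand-ins for demonstration.
def frame_from_gene_start(start):
    """Return the assumed reading frame (0, 1 or 2) of a gene beginning at
    0-based reference position `start`."""
    return start % 3


example_genes = {"PR": {"start": 56, "end": 352},
                 "RT": {"start": 168, "end": 1847}}
example_frames = {frame_from_gene_start(g["start"])
                  for g in example_genes.values()}
print(example_frames)  # {0, 2}: 168 % 3 == 0 and 56 % 3 == 2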
def cli(ctx, bam, reference, percentage, id, output):
    rs = parse_references_from_fasta(reference)
    bam_header = pysam.Samfile(bam, "rb").header

    if id:
        fasta_id = id
    else:
        fasta_id = os.path.basename(bam).split('.')[0]

    for r in rs:
        mrc = parse_mapped_reads_from_bam(r, bam)
        conseq = mrc.to_consensus(percentage)

        # Prefer the read group (RG) from the BAM header, when present
        if 'RG' in bam_header:
            fasta_id = bam_header['RG']

        if output:
            output.write('>{0}_{1}_{2}\n{3}'.format(fasta_id, percentage,
                                                    r.name, conseq))
        else:
            click.echo('>{0}_{1}_{2}\n{3}'.format(fasta_id, percentage,
                                                  r.name, conseq))

    if output:
        output.close()
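# Minimal sketch of a percentage-threshold consensus call, as an assumption
# about the idea behind mrc.to_consensus(percentage) rather than its actual
# implementation: at each pileup position, a base must reach the given
# percentage of the depth to be called; the real tool may collapse several
# qualifying bases into an IUPAC ambiguity code, while this sketch keeps only
# the most frequent one and falls back to 'N'.
def consensus_base(counts, percentage):
    depth = sum(counts.values())
    qualifying = {base: n for base, n in counts.items()
                  if depth and 100.0 * n / depth >= percentage}
    if not qualifying:
        return "N"
    return max(qualifying, key=qualifying.get)


print(consensus_base({"a": 180, "g": 20}, 20))   # -> 'a' (90% of the depth)
print(consensus_base({"a": 180, "g": 20}, 95))   # -> 'N' (no base reaches 95%)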
def ntvar(bam, reference, error_rate, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, rs, *mapped_read_collection_arr)

    variants.filter('q30', 'QUAL<30', True)
    variants.filter('ac5', 'AC<5', True)
    variants.filter('dp100', 'DP<100', True)

    if output:
        output.write(variants.to_vcf_file())
        output.close()
    else:
        click.echo(variants.to_vcf_file())
def setup(self):
    csv = TEST_PATH + "/data/output/mutant_types.csv"
    reference = TEST_PATH + "/data/hxb2_pol.fas"
    self.offset = 1269

    rs = parse_references_from_fasta(reference)
    self.ref_seq = rs[0].seq

    self.codon_variants = parse_codon_variants(csv, rs)
def cli(ctx, csv, reference, offset, output):
    rs = parse_references_from_fasta(reference)
    ref_seq = rs[0].seq

    codon_variants = parse_codon_variants(csv, rs)

    if output:
        output.write(codon_variants.report_dnds_values(ref_seq, offset))
    else:
        click.echo(codon_variants.report_dnds_values(ref_seq, offset))
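# Hedged sketch of the quantity behind report_dnds_values: a simple pN/pS
# ratio, where pN is observed non-synonymous changes over non-synonymous
# sites and pS is the synonymous analogue; a ratio above 1 suggests positive
# selection. This is a toy simplification (no Jukes-Cantor correction, no
# per-gene aggregation) and is not quasitools' actual computation.
def dn_ds(ns_changes, ns_sites, s_changes, s_sites):
    pn = ns_changes / ns_sites
    ps = s_changes / s_sites
    return pn / ps if ps else float("inf")


print(round(dn_ds(9, 300.0, 1, 100.0), 2))  # -> 3.0, i.e. pN three times pS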
def cli(ctx, bam, reference, variants, bed4_file, min_freq, mutation_db,
        reporting_threshold, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create the AAVariant collection
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf' + str(min_freq), 'freq<' + str(min_freq), True)

    # Build the mutation database
    mutation_db = MutationDB(mutation_db, genes)

    # Generate the mutation report
    if output:
        output.write(
            aa_vars.report_dr_mutations(mutation_db, reporting_threshold))
        output.close()
    else:
        click.echo(
            aa_vars.report_dr_mutations(mutation_db, reporting_threshold))
def aavar(bam, reference, variants, genes_file, min_freq, mutation_db,
          output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the hmcf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    if output:
        output.write(aa_vars.to_hmcf_file(CONFIDENT))
    else:
        click.echo(aa_vars.to_hmcf_file(CONFIDENT))
def codonvar(bam, reference, offset, bed4_file, variants, error_rate, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []

    # Create a MappedReadCollection object
    for r in rs:
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)
        variants_obj = variants

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]['frame'])

    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    codon_variants = CodonVariantCollection.from_aacensus(aa_census)

    if output:
        output.write(codon_variants.to_csv_file(offset))
        output.close()
    else:
        click.echo(codon_variants.to_csv_file(offset))
def setup_class(self):
    self.reference = TEST_PATH + "/data/hxb2_pol.fas"
    self.references = parse_references_from_fasta(self.reference)
    self.variant_collection = CodonVariantCollection(self.references)
    self.offset = 1269

    self.variant_collection.variants['gag']['3']['aTa'] = CodonVariant(
        chrom="hxb2_pol",
        pos=1,
        gene="gag",
        nt_start_gene=1309,
        nt_end_gene=2841,
        nt_start=2077,
        nt_end=2079,
        ref_codon="ata",
        mutant_codon="aTa",
        ref_aa="I",
        mutant_aa="K",
        coverage=563,
        mutant_freq=1.60,
        mutant_type="S",
        ns_count=1.0000,
        s_count=1.5000)

    self.variant_collection.variants['tat']['10']['aAa'] = CodonVariant(
        chrom="hxb2_pol",
        pos=2,
        gene="tat",
        nt_start_gene=3309,
        nt_end_gene=4841,
        nt_start=4000,
        nt_end=4002,
        ref_codon="ata",
        mutant_codon="aAa",
        ref_aa="I",
        mutant_aa="K",
        coverage=563,
        mutant_freq=1.60,
        mutant_type="S",
        ns_count=1.0000,
        s_count=1.5000)
def setup(self):
    reference = TEST_PATH + "/data/hxb2_pol.fas"
    bam = TEST_PATH + "/data/align.bam"
    genes_file = TEST_PATH + "/data/hxb2_pol.bed"
    mutation_db = TEST_PATH + "/data/mutation_db.tsv"
    min_freq = 0.01

    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(
            parse_mapped_reads_from_bam(r, bam))

    variants_obj = parse_nt_variants_from_vcf(VARIANTS_FILE, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                         frames)

    # Find the AA mutations
    self.aa_collection = AAVariantCollection.from_aacensus(aa_census)

    # Build the mutation database
    self.mutation_db = MutationDB(mutation_db, genes)
def setup_class(self):
    reference = TEST_PATH + "/data/hxb2_pol.fas"
    bam = TEST_PATH + "/data/align.bam"
    BED4_file = TEST_PATH + "/data/hxb2_pol.bed"

    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(
            parse_mapped_reads_from_bam(r, bam))

    genes = parse_BED4_file(BED4_file, rs[0].name)

    # Determine which frames our genes are in
    self.frames = set()
    for gene in genes:
        self.frames.add(genes[gene]["frame"])

    self.aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                              self.frames)
def setup(self):
    bam = TEST_PATH + "/data/align.bam"
    reference = TEST_PATH + "/data/hxb2_pol.fas"
    genes_file = TEST_PATH + "/data/hxb2_pol.bed"
    error_rate = 0.0038

    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []

    # Create a MappedReadCollection object
    for r in rs:
        mapped_read_collection_arr.append(
            parse_mapped_reads_from_bam(r, bam))

    variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, rs, *mapped_read_collection_arr)

    variants.filter('q30', 'QUAL<30', True)
    variants.filter('ac5', 'AC<5', True)
    variants.filter('dp100', 'DP<100', True)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]['frame'])

    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    self.codon_variants = CodonVariantCollection.from_aacensus(aa_census)
def __init__(self, id, output_dir, reads, reference, BED4_file, mutation_db,
             quiet, consensus_pct):
    self.id = id
    self.output_dir = output_dir
    self.reads = reads
    self.reference = reference
    self.mutation_db = mutation_db
    self.BED4_file = BED4_file
    self.quiet = quiet
    self.consensus_pct = consensus_pct

    self.input_size = 0
    self.determine_input_size()

    self.references = parse_references_from_fasta(self.reference)
    self.genes = parse_BED4_file(BED4_file, self.references[0].name)

    self.quality = QualityControl()

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    self.filtered_reads_dir = "%s/filtered.fastq" % output_dir
def test_valid_vcf_file(self):
    """Tests to ensure that valid vcf files are parsed properly."""
    reference = TEST_PATH + "/data/hxb2_pol.fas"
    bam = TEST_PATH + "/data/align.bam"

    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(
            parse_mapped_reads_from_bam(r, bam))

    variants_obj = NTVariantCollection(rs)

    for i in range(0, 20):
        variant = NTVariant(chrom="hxb2_pol",
                            pos=i,
                            id=".",
                            ref='a',
                            alt='t',
                            qual="50",
                            filter="PASS",
                            info={
                                "DP": "300",
                                "AC": "1",
                                "AF": "0.0025"
                            })
        variants_obj.variants["hxb2_pol"][i]['t'] = variant

    # Create a valid vcf file
    valid_vcf_file = TEST_PATH + "/data/valid_vcf_file.vcf"

    with open(valid_vcf_file, "w+") as f:
        f.write(
            "##fileformat=VCFv4.2\n"
            "##fileDate=20171005\n"
            "##source=quasitools\n"
            "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"
            "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele Count\">\n"
            "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n"
            "##FILTER=<ID=q30,Description=\"Quality below 30\">\n"
            "##FILTER=<ID=dp100,Description=\"Read depth below 100\">\n"
            "##FILTER=<ID=ac5,Description=\"Allele count below 5\">\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

        for rid in variants_obj.variants:
            for pos in variants_obj.variants[rid]:
                for alt in variants_obj.variants[rid][pos]:
                    variant = variants_obj.variants[rid][pos][alt]
                    f.write("\n%s\t%i\t%s\t%s\t%s\t%s\t%s" %
                            (variant.chrom, int(variant.pos), variant.id,
                             variant.ref, variant.alt, variant.qual,
                             variant.filter))
                    f.write("\tDP=%i;AC=%i;AF=%0.4f" %
                            (int(variant.info["DP"]),
                             int(variant.info["AC"]),
                             float(variant.info["AF"])))

    parsed_nt_var = parse_nt_variants_from_vcf(valid_vcf_file, rs)

    # Check equality of the parsed NTVariantCollection vs. the valid
    # NTVariantCollection
    for rid in parsed_nt_var.variants:
        for pos in parsed_nt_var.variants[rid]:
            for alt in parsed_nt_var.variants[rid][pos]:
                parsed_variant = parsed_nt_var.variants[rid][pos][alt]
                variant = variants_obj.variants[rid][pos][alt]

                assert parsed_variant.chrom == variant.chrom
                assert parsed_variant.pos == variant.pos
                assert parsed_variant.id == variant.id
                assert parsed_variant.ref == variant.ref
                assert parsed_variant.alt == variant.alt
                assert parsed_variant.qual == variant.qual
                assert parsed_variant.filter == variant.filter
                assert parsed_variant.info["DP"] == variant.info["DP"]
                assert parsed_variant.info["AC"] == variant.info["AC"]
                assert parsed_variant.info["AF"] == variant.info["AF"]

    os.remove(valid_vcf_file)
def setup_class(self):
    self.bam1 = TEST_PATH + '/data/quasi1.bam'
    self.bam2 = TEST_PATH + '/data/quasi2.bam'
    self.test_cp_files = (self.bam1, self.bam2)
    self.test_cp_ref = TEST_PATH + '/data/hxb2_pol.fas'
    self.references = parse_references_from_fasta(self.test_cp_ref)
def dist(ctx, reference, bam, normalize, output_distance, startpos, endpos,
         output, no_coverage):
    """
    dist - Performs the main part of the program

    INPUT:
        [CONTEXT] [ctx]
        [FASTA FILE LOCATION] [reference]
        [BAM FILE LOCATION] [bam]
        [BOOL] [normalize/dont_normalize]
        [BOOL] [output_distance/output_similarity]
        [INT] [startpos]
        [INT] [endpos]
        [STRING] [output]
            Write the CSV-formatted matrix to a file instead of the terminal.
        [STRING] [truncate/remove_no_coverage/keep_no_coverage]
            Options to truncate low-coverage regions on the ends of the
            pileup, ignore all low-coverage regions, or keep all low-coverage
            regions.

    RETURN:
        None.

    POST:
        The distance matrix is printed out unless an error message was
        raised.
    """
    if len(bam) < 2:
        raise click.UsageError("At least two bam file locations are required"
                               " to perform quasispecies distance comparison")

    # Indicate if the start or end position is < 1 or otherwise invalid
    if type(startpos) == int and int(startpos) < 1:
        raise click.UsageError("Start position must be >= 1.")
    if type(endpos) == int and int(endpos) < 1:
        raise click.UsageError("End position must be >= 1.")
    if (type(startpos) == int and type(endpos) == int
            and startpos > endpos):
        raise click.UsageError("Start position must be <= end position")

    # Build the reference object.
    references = parse_references_from_fasta(reference)

    pileups = Pileup_List.construct_pileup_list(bam, references)

    if startpos is None:
        startpos = 1
    if endpos is None:
        endpos = pileups.get_pileup_length()

    if pileups.get_pileup_length() == 0:
        raise click.UsageError("Empty pileup was produced from BAM files. "
                               "Halting program")

    click.echo("The start position is %d." % startpos)
    click.echo("The end position is %d." % endpos)
    click.echo("Constructed pileup from reference.")

    # click.echo the number of positions in the pileup
    click.echo("The pileup covers %d positions before modifications." %
               pileups.get_pileup_length())

    # Indicate whether the user-specified start and end positions are out of
    # bounds (compared to the actual number of positions in the pileup)
    if startpos > pileups.get_pileup_length():
        raise click.UsageError("Start position must be less than or"
                               " equal to the number of nucleotide base "
                               "positions in pileup (%s)." %
                               pileups.get_pileup_length())
    if endpos > pileups.get_pileup_length():
        raise click.UsageError("End position must be less than or equal to "
                               "the number of nucleotide base positions in "
                               "pileup (%s)." % pileups.get_pileup_length())

    # Convert the start and end positions from one-based indexing to
    # zero-based indexing, which is expected by distance.py and pileup.py
    startpos -= 1
    endpos -= 1

    # If there are no errors so far, proceed with running the program
    modified = modify_pileups(ctx, normalize, startpos, endpos, no_coverage,
                              pileups)

    if no_coverage != 'keep_no_coverage' and len(modified) == 0:
        raise click.UsageError("Entire pileup was truncated due to "
                               "lack of coverage. Halting program")

    dist = DistanceMatrix(modified, bam)

    if output_distance:
        click.echo("Outputting an angular cosine distance matrix.")
        if output:
            output.write(dist.get_distance_matrix_as_csv())
        else:
            click.echo(dist.get_distance_matrix_as_csv())
    else:
        click.echo("Outputting a cosine similarity matrix.")
        if output:
            output.write(dist.get_similarity_matrix_as_csv())
        else:
            click.echo(dist.get_similarity_matrix_as_csv())
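# Sketch of the quantities DistanceMatrix reports, assuming the standard
# definitions: cosine similarity between two per-position base-count (or
# base-frequency) vectors, and angular cosine distance defined as
# 2 * acos(similarity) / pi. This illustrates the formulas only; it is not
# quasitools' actual DistanceMatrix implementation.
import math


def cosine_similarity(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm = (math.sqrt(sum(a * a for a in u)) *
            math.sqrt(sum(b * b for b in v)))
    return dot / norm


def angular_cosine_distance(u, v):
    # Clamp to guard against floating-point values slightly above 1.0
    return 2.0 * math.acos(min(1.0, cosine_similarity(u, v))) / math.pi


u = [10, 0, 2, 0]   # A, C, G, T counts at one position of pileup 1
v = [8, 1, 3, 0]    # the same position of pileup 2
print(round(angular_cosine_distance(u, v), 4))  # small distance: similar pileups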
def aavar(bam, reference, bed4_file, variants, mutation_db, min_freq,
          error_rate, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)
        variants_obj = variants

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()
    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    aavf_obj = aa_vars.to_aavf_obj("aavar", os.path.basename(reference),
                                   CONFIDENT)
    records = list(aavf_obj)

    if output:
        writer = parser.Writer(output, aavf_obj)
    else:
        writer = parser.Writer(sys.stdout, aavf_obj)

    for record in records:
        writer.write_record(record)

    if output:
        output.close()

    writer.close()
def bam(reference_location, bam_location, k, haplotype_filter,
        output_location):
    '''
    Reports the k-mer complexity of the pileup (bam and reference), for each
    k-mer position in the reference, of a quasispecies using several measures
    outlined in the following work:

    Gregori, Josep, et al. "Viral quasispecies complexity measures."
    Virology 493 (2016): 227-237.
    '''
    """
    # ========================================================================

    BAM COMPLEXITY

    PURPOSE
    -------
    Create a report of the k-mer complexity of the pileup, for each k-mer
    position in the reference.

    INPUT
    -----
    [(BAM) FILE LOCATION] [bam_location]
        The file location of a bam file.

    [(REFERENCE) FILE LOCATION] [reference_location]
        The file location of the reference file.

    [INT] [k]
        Provides the sequence length for our reads from a given starting
        position.

    [FLOAT] [haplotype_filter]
        User-defined filter between 0 and 100; haplotypes under the filter
        size will be removed from each positional list. Default is set to 0
        (i.e. it will not filter).

    [(OUTPUT) FILE LOCATION] [output_location]
        The location of the output file.

    RETURN
    ------
    [NONE]

    POST
    ----
    The complexity computation will be completed and the results will be
    stored in a CSV file or written to standard output.

    # ========================================================================
    """
    k = int(k)

    references = parse_references_from_fasta(reference_location)

    # A list where each position contains a list of haplotypes of length k
    # starting at that position in the reference.
    haplotype_list = parse_haplotypes_from_bam(references,
                                               reference_location,
                                               bam_location, k)

    measurements_list = []

    for i in range(len(haplotype_list)):
        haplotypes = haplotype_list[i]

        # Remove haplotypes below the threshold.
        # Get the total number of haplotypes for this position.
        total_haplotypes = haplotype.calculate_total_clones(haplotypes)

        # Add haplotypes within the threshold to a new haplotypes list
        haplotypes_within_filter = [
            hap for hap in haplotypes
            if (float(hap.count) / float(total_haplotypes) * 100)
            >= haplotype_filter
        ]

        measurements = measure_complexity(haplotypes_within_filter)
        measurements_list.append(measurements)

    # If output_location is specified, open it as complexity_file; if not
    # specified, complexity_file is set to sys.stdout.
    with open(output_location, 'w') if output_location else sys.stdout as \
            complexity_file:
        measurement_to_csv(measurements_list, complexity_file)
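# Illustrative sketch of one of the Gregori et al. complexity measures that
# measure_complexity is expected to report: Shannon entropy over the
# haplotype frequencies observed at a single k-mer window. The function and
# input layout here are assumptions for demonstration, not quasitools' code.
import math


def shannon_entropy(haplotype_counts):
    """Shannon entropy (natural log) of a list of haplotype counts."""
    total = float(sum(haplotype_counts))
    frequencies = [count / total for count in haplotype_counts if count]
    return -sum(f * math.log(f) for f in frequencies)


print(round(shannon_entropy([50, 30, 15, 5]), 4))  # ~1.1422 nats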
def test_valid_csv_file(self):
    """Tests to make sure that a valid codon variant csv file is properly
    parsed into a CodonVariantCollection object.
    """
    reference = TEST_PATH + "/data/hxb2_pol.fas"
    rs = parse_references_from_fasta(reference)

    var_obj = CodonVariantCollection(rs)

    for i in range(0, 30):
        variant = CodonVariant(chrom="hxb2_pol",
                               pos=i,
                               gene="gag",
                               nt_start_gene=1309 + i,
                               nt_end_gene=2841 + i,
                               nt_start=2077 + i,
                               nt_end=2079 + i,
                               ref_codon="ata",
                               mutant_codon="aAa",
                               ref_aa="I",
                               mutant_aa="K",
                               coverage=563 + i,
                               mutant_freq=1.60 + i,
                               mutant_type="S",
                               ns_count=1.0000,
                               s_count=1.5000)
        pos = int(variant.nt_start) - int(variant.nt_start_gene)
        var_obj.variants["gag"][pos]["aAa"] = variant

    valid_csv = TEST_PATH + "/data/valid_csv.csv"

    with open(valid_csv, "w+") as f:
        f.write("#gene,nt position (gene),nt start position,"
                "nt end position,ref codon,mutant codon,ref AA,mutant AA,"
                "coverage,mutant frequency,mutant type,NS count,S count\n")

        for gene in var_obj.variants:
            for pos in var_obj.variants[gene]:
                for codon in var_obj.variants[gene][pos]:
                    variant = var_obj.variants[gene][pos][codon]
                    f.write(
                        "%s,%i-%i,%i,%i,%s,%s,%s,%s,%i,%.2f,%s,%0.4f,%0.4f\n"
                        % (variant.gene, variant.nt_start_gene,
                           variant.nt_end_gene, variant.nt_start,
                           variant.nt_end, variant.ref_codon,
                           variant.mutant_codon, variant.ref_aa,
                           variant.mutant_aa, variant.coverage,
                           variant.mutant_freq, variant.mutant_type,
                           variant.ns_count, variant.s_count))

    parsed_codon_variants = parse_codon_variants(valid_csv, rs)

    for gene in parsed_codon_variants.variants:
        for pos in parsed_codon_variants.variants[gene]:
            for codon in parsed_codon_variants.variants[gene][pos]:
                parsed_variant = \
                    parsed_codon_variants.variants[gene][pos][codon]
                variant = var_obj.variants[gene][pos][codon]

                assert parsed_variant.chrom == variant.chrom
                assert parsed_variant.nt_start_gene == variant.nt_start_gene
                assert parsed_variant.nt_end_gene == variant.nt_end_gene
                assert parsed_variant.nt_start == variant.nt_start
                assert parsed_variant.nt_end == variant.nt_end
                assert parsed_variant.ref_codon == variant.ref_codon
                assert parsed_variant.mutant_codon == variant.mutant_codon
                assert parsed_variant.ref_aa == variant.ref_aa
                assert parsed_variant.mutant_aa == variant.mutant_aa
                assert parsed_variant.coverage == variant.coverage
                assert parsed_variant.mutant_freq == variant.mutant_freq
                assert parsed_variant.mutant_type == variant.mutant_type
                assert parsed_variant.ns_count == variant.ns_count
                assert parsed_variant.s_count == variant.s_count

    os.remove(valid_csv)