def test_write_and_format_decimals(self):
    """Check that ALT_FREQ is written with the requested number of decimals.

    For every precision from 3 to 5 the sample file is written to
    ``sampleoutput4.aavf`` with ``decimals`` set to that precision, read
    back, and compared record by record with a fresh read of the sample.
    """
    for num_dec in range(3, 6):
        source = parser.Reader(SAMPLE_FILE).read_records()
        out = fhandle('sampleoutput4.aavf', "w+")
        writer = parser.Writer(out, source)

        for entry in list(source):
            writer.write_record(entry, decimals=num_dec)
        out.close()

        written = parser.Reader(
            TEST_PATH + '/sampleoutput4.aavf').read_records()
        expected = parser.Reader(SAMPLE_FILE).read_records()
        writer.close()

        # The value read back must equal the sample value rounded to
        # num_dec places (e.g. 0.123 when num_dec is three).
        for got, want in zip(written, expected):
            assert got.INFO == want.INFO
            assert got.ALT_FREQ == round(want.ALT_FREQ, num_dec), \
                "%s and %s should be the same up to the %dth decimal place" % (
                    got.ALT_FREQ, want.ALT_FREQ, num_dec)
def test_write_to_read_from_stream(self):
    """Round-trip the sample file through an in-memory text stream.

    The sample records are written to a ``StringIO``, the stream is rewound
    and parsed again, and the resulting metadata and records are checked.
    """
    source = parser.Reader(SAMPLE_FILE).read_records()
    out = StringIO()
    writer = parser.Writer(out, source)
    for entry in source:
        writer.write_record(entry)

    # Rewind so the reader starts at the beginning of what was written.
    out.seek(0)
    aavf = parser.Reader(out).read_records()
    out.close()
    record_list = list(aavf)

    assert isinstance(aavf, AAVF)

    # (metadata key, expected value, failure message template)
    metadata_checks = (
        ("fileformat", "AAVFv1.0",
         "fileformat should be AAVFv1.0, metadata is %s"),
        ("fileDate", "20180501",
         "filedate should be 20180501, metadata is %s"),
        ("source", "myProgramV1.0",
         "source should be myProgramV1.0, metadata is %s"),
        ("reference", ["hxb2.fas"],
         "reference list should be [hxb2.fas], metadata is %s"),
    )
    for key, expected, message in metadata_checks:
        assert aavf.metadata.get(key) == expected, message % aavf.metadata

    assert aavf.infos
    assert aavf.filters
    assert len(record_list) == 7

    # every parsed data line must come back as a Record
    assert all(isinstance(entry, Record) for entry in record_list)
def test_write(self):
    """Verify that INFO fields survive a write to ``sampleoutput2.aavf``."""
    source = parser.Reader(SAMPLE_FILE).read_records()
    out = fhandle('sampleoutput2.aavf', "w+")
    writer = parser.Writer(out, source)

    for entry in list(source):
        writer.write_record(entry)
    writer.flush()
    writer.close()

    written_path = TEST_PATH + "/sampleoutput2.aavf"

    # Fresh readers for the comparison: the sample input on the left, the
    # file that was just written on the right.
    expected = parser.Reader(SAMPLE_FILE).read_records()
    written = parser.Reader(written_path).read_records()

    for sample_rec, written_rec in zip(expected, written):
        assert sample_rec.INFO == written_rec.INFO, \
            "left.INFO is %s and right.INFO is %s" % (
                sample_rec.INFO, written_rec.INFO)
def test_aa_variants_nodb(self):
    """Check the AAVF output produced when no mutation database is applied.

    The expected lines are read from ``VALID_AA_VARIANTS_AAVF`` and their
    last two ``;``-separated tokens are replaced with ``CAT=.`` and
    ``SRVL=.``. The collection is then filtered, written to a temporary
    AAVF file, read back, and compared token by token with those lines.
    """
    aavf_path = TEST_PATH + "/data/output/temp.aavf"

    # Read from file and make sure there are no empty lines
    with open(VALID_AA_VARIANTS_AAVF, "r") as valid_file:
        valid_variants = valid_file.read()

    valid_variants_lines = sorted(filter(None, valid_variants.split("\n")))

    # Replace category and surveillance with "."s
    # Okay because comparisons only done on non "#" lines
    for i, line in enumerate(valid_variants_lines):
        tokens = line.split(";")

        # Change result to be what it would be without a db
        if len(tokens) > 2:
            line = line.replace(tokens[-2], "CAT=.", 1)
            line = line.replace(tokens[-1], "SRVL=.", 1)

        valid_variants_lines[i] = line

    # Apply the filter to the collection
    self.aa_collection.filter('af0.01', 'freq<0.01', True)

    aavf_obj = self.aa_collection.to_aavf_obj(
        "test", os.path.basename(self.reference), CONFIDENT)
    records = list(aavf_obj)

    # The context manager closes the output file even if writing fails.
    with open(aavf_path, "w") as aavf_out:
        writer = parser.Writer(aavf_out, aavf_obj)
        for record in records:
            writer.write_record(record)

    with open(aavf_path, "r") as written_file:
        aa_variants = written_file.read()

    # Make sure it's sorted and has no empty strings
    aa_variants_lines = sorted(filter(None, aa_variants.split("\n")))

    # Check the length
    assert len(valid_variants_lines) == len(aa_variants_lines)

    # Make sure all the tokens that need to be there are there
    for valid_line, actual_line in zip(valid_variants_lines,
                                       aa_variants_lines):
        if valid_line.startswith("#"):
            continue

        valid_variants_tokens = re.split("[,=;\t]", valid_line.rstrip())
        aa_variants_tokens = re.split("[,=;\t]", actual_line)

        for token in aa_variants_tokens:
            assert token in valid_variants_tokens
def test_aa_variants(self):
    """Check the AAVF output produced after applying the mutation database.

    The collection is filtered, annotated with ``self.mutation_db``,
    written to a temporary AAVF file, read back, and compared token by
    token with the expected lines from ``VALID_AA_VARIANTS_AAVF``.
    """
    aavf_path = TEST_PATH + "/data/output/temp.aavf"

    # Read from file and make sure there are no empty lines
    with open(VALID_AA_VARIANTS_AAVF, "r") as valid_file:
        valid_variants = valid_file.read()

    # Sort and filter so comparison order will be fine afterwards
    valid_variants_lines = sorted(filter(None, valid_variants.split("\n")))

    # Apply the filter to the collection
    self.aa_collection.filter('af0.01', 'freq<0.01', True)

    # Annotate the collection using the mutation database
    self.aa_collection.apply_mutation_db(self.mutation_db)

    aavf_obj = self.aa_collection.to_aavf_obj(
        "test", os.path.basename(self.reference), CONFIDENT)
    records = list(aavf_obj)

    # The context manager closes the output file even if writing fails.
    with open(aavf_path, "w") as aavf_out:
        writer = parser.Writer(aavf_out, aavf_obj)
        for record in records:
            writer.write_record(record)

    # As before, the writer is closed only after its file has been closed.
    writer.close()

    with open(aavf_path, "r") as written_file:
        aa_variants = written_file.read()

    # Make sure it's sorted and has no empty strings
    aa_variants_lines = sorted(filter(None, aa_variants.split("\n")))

    # Check the length
    assert len(valid_variants_lines) == len(aa_variants_lines)

    # Make sure all the tokens that need to be there are there
    for valid_line, actual_line in zip(valid_variants_lines,
                                       aa_variants_lines):
        if valid_line.startswith("#"):
            continue

        valid_variants_tokens = re.split("[,=;\t]", valid_line.rstrip())
        aa_variants_tokens = re.split("[,=;\t]", actual_line)

        for token in aa_variants_tokens:
            assert token in valid_variants_tokens
def test_write_to_file(self):
    """Test whether writes to file work as expected.

    The sample file is written to ``sampleoutput3.aavf``; the written file
    and the sample are then read back and must contain the same number of
    records with equal INFO fields.
    """
    reader = parser.Reader(SAMPLE_FILE)
    aavf_obj = reader.read_records()
    out = fhandle('sampleoutput3.aavf', "w+")
    try:
        writer = parser.Writer(out, aavf_obj)
        for record in list(aavf_obj):
            writer.write_record(record)
    finally:
        # Close the handle even when writing raises.
        out.close()

    # Materialise both record streams once. Previously the written-file
    # records were consumed by the length check and only the sample reader
    # was re-created, so the INFO comparison could run over nothing if the
    # parsed object can only be iterated once.
    written = list(
        parser.Reader(TEST_PATH + '/sampleoutput3.aavf').read_records())
    expected = list(parser.Reader(SAMPLE_FILE).read_records())

    # all data lines should be read from the sample file
    assert len(written) == len(expected)

    for left, right in zip(written, expected):
        assert left.INFO == right.INFO
def test_writer(self):
    """Check the ordering of INFO fields in written records.

    INFO fields must follow the order in which they are defined in the
    header; fields without a definition come last, in alphabetical order.
    """
    source = parser.Reader(SAMPLE_FILE).read_records()
    out = StringIO()
    writer = parser.Writer(out, source)
    for entry in source:
        writer.write_record(entry)
    out.seek(0)
    written_text = out.getvalue()
    out.close()

    definitions = []
    for line in written_text.split('\n'):
        # Collect INFO IDs in header order as they are encountered.
        if line.startswith('##INFO='):
            after_id = line.split('ID=')[1]
            definitions.append(after_id.split(',')[0])

        # Only non-empty, non-header lines are data records.
        if line and not line.startswith('#'):
            info_column = line.split('\t')[7]
            fields = [item.split('=')[0] for item in info_column.split(';')]
            self._assert_order(definitions, fields)
def annotate_aavf(in_file, hivdb_file, out_file):
    """Annotate an AAVF input file with drug resistance changes.

    Reads the records of ``in_file``, registers ``CAT`` and ``DRUG`` INFO
    definitions, and writes every record to ``out_file``. Records for which
    ``evaluate_resistance`` reports a hit get ``CAT`` and ``DRUG`` INFO
    entries.

    Args:
        in_file: AAVF input passed to ``parser.Reader``.
        hivdb_file: Path of the file handed to ``XmlAsiTransformer``.
        out_file: Path of the annotated AAVF file to write.
    """
    # Close the definition file once it has been transformed.
    # NOTE(review): assumes transform() fully parses the handle before
    # returning - confirm against the XmlAsiTransformer implementation.
    with open(hivdb_file, "r") as hivdb_handle:
        res_genes = XmlAsiTransformer(True).transform(hivdb_handle)

    reader = parser.Reader(in_file)
    aavf_obj = reader.read_records()
    # The description strings are emitted in the output header and are
    # therefore kept verbatim.
    aavf_obj.infos["CAT"] = model.Info(
        "CAT", ".", "String", "Drug resistance category", None, None)
    aavf_obj.infos["DRUG"] = model.Info(
        "DRUG", ".", "String", "Drug reistances", None, None)

    with open(out_file, "w") as out_handle:
        writer = parser.Writer(out_handle, aavf_obj)
        for rec in aavf_obj:
            if rec.POS not in rec.ALT:
                rmuts = evaluate_resistance([rec], res_genes)
                # Hits are looked up by (gene, "<REF><POS><first ALT>").
                k = (rec.GENE, "%s%s%s" % (rec.REF, rec.POS, rec.ALT[0]))
                hits = rmuts.get(k)
                if hits:
                    cats = list(hits.keys())
                    drugs = []
                    for cat in cats:
                        drugs.extend(hits[cat])
                    rec.INFO["CAT"] = cats
                    rec.INFO["DRUG"] = drugs
            writer.write_record(rec)
def aavar(bam, reference, bed4_file, variants, mutation_db, min_freq,
          error_rate, output):
    """Call amino acid variants and write them as AAVF.

    Args:
        bam: Passed to ``parse_mapped_reads_from_bam`` for every reference.
        reference: Reference FASTA; parsed for references and also passed
            to ``AACensus``.
        bed4_file: Gene file parsed with ``parse_BED4_file``.
        variants: Optional VCF of nucleotide variants. When falsy, variants
            are called from the mapped reads and filtered instead.
        mutation_db: Optional mutation database source; when not ``None``
            it is applied to the amino acid variants.
        min_freq: Currently unused.
            NOTE(review): the mutant frequency filter below is hard-coded
            to 0.01 - confirm whether it should use this value.
        error_rate: Error rate used when calling nucleotide variants.
        output: Writable stream for the AAVF output; ``sys.stdout`` is used
            when it is falsy.
    """
    rs = parse_references_from_fasta(reference)

    # One MappedReadCollection per reference
    mapped_read_collection_arr = [
        parse_mapped_reads_from_bam(r, bam) for r in rs]

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants_obj = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants_obj.filter('q30', 'QUAL<30', True)
        variants_obj.filter('ac5', 'AC<5', True)
        variants_obj.filter('dp100', 'DP<100', True)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = {genes[gene]['frame'] for gene in genes}

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    aavf_obj = aa_vars.to_aavf_obj(
        "aavar", os.path.basename(reference), CONFIDENT)
    records = list(aavf_obj)

    writer = parser.Writer(output if output else sys.stdout, aavf_obj)

    for record in records:
        writer.write_record(record)

    writer.close()

    # The original referenced ``output.close`` without calling it, so the
    # stream was never closed here. Closing after the writer keeps the
    # writer's own shutdown unchanged.
    if output:
        output.close()
def analyze_reads(self, fasta_id, variant_filters, reporting_threshold,
                  generate_consensus):
    """Run the analysis steps for one set of reads and write the reports.

    Files written under ``self.output_dir``: ``consensus.fasta`` (only when
    ``generate_consensus`` is truthy), ``hydra.vcf``, ``coverage_file.csv``,
    ``mutation_report.aavf`` and, when a mutation database is configured,
    ``dr_report.csv``.

    Args:
        fasta_id: Identifier passed to ``self.generate_bam`` and used in
            the consensus sequence headers.
        variant_filters: Mapping read with the keys ``ERROR_RATE``,
            ``MIN_VARIANT_QUAL``, ``MIN_AC``, ``MIN_DP`` and ``MIN_FREQ``.
        reporting_threshold: Used in the consensus headers and passed to
            ``report_dr_mutations``.
        generate_consensus: When truthy, consensus sequences are written.
    """
    # Map reads against reference using bowtietwo
    if not self.quiet:
        print("# Mapping reads...")

    # Errors from generate_bam propagate unchanged; the former
    # catch-and-re-raise wrapper added nothing.
    bam = self.generate_bam(fasta_id)

    if not self.quiet:
        print("# Loading read mappings...")

    # cmd_consensus
    cons_seq_file = None
    if generate_consensus:
        cons_seq_file = open("%s/consensus.fasta" % self.output_dir, "w+")

    mapped_read_collection_arr = []
    try:
        for r in self.references:
            mrc = parse_mapped_reads_from_bam(r, bam)
            mapped_read_collection_arr.append(mrc)
            consensus_seq = mrc.to_consensus(self.consensus_pct)
            if cons_seq_file is not None and len(consensus_seq) > 0:
                cons_seq_file.write('>{0}_{1}_{2}\n{3}'.format(
                    fasta_id, reporting_threshold, r.name, consensus_seq))
    finally:
        # Close the consensus file even if parsing a reference fails.
        if cons_seq_file is not None:
            cons_seq_file.close()

    # cmd_callntvar
    if not self.quiet:
        print("# Identifying variants...")

    variants = NTVariantCollection.from_mapped_read_collections(
        variant_filters[ERROR_RATE], self.references,
        *mapped_read_collection_arr)

    variants.filter('q%s' % variant_filters[MIN_VARIANT_QUAL],
                    'QUAL<%s' % variant_filters[MIN_VARIANT_QUAL], True)
    variants.filter('ac%s' % variant_filters[MIN_AC],
                    'AC<%s' % variant_filters[MIN_AC], True)
    variants.filter('dp%s' % variant_filters[MIN_DP],
                    'DP<%s' % variant_filters[MIN_DP], True)

    with open("%s/hydra.vcf" % self.output_dir, "w+") as vcf_file:
        vcf_file.write(variants.to_vcf_file())

    # cmd_aa_census
    if not self.quiet:
        print("# Masking filtered variants...")

    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants)

    if not self.quiet:
        print("# Building amino acid census...")

    # Determine which frames our genes are in
    frames = {self.genes[gene]['frame'] for gene in self.genes}

    aa_census = AACensus(self.reference, mapped_read_collection_arr,
                         self.genes, frames)

    with open("%s/coverage_file.csv" % self.output_dir, "w+") as coverage_file:
        coverage_file.write(aa_census.coverage(frames))

    # cmd_aavariants
    if not self.quiet:
        print("# Finding amino acid mutations...")

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf%s' % variant_filters[MIN_FREQ],
                   'freq<%s' % variant_filters[MIN_FREQ], True)

    # Build the mutation database and update collection
    mutation_db = None
    if self.mutation_db is not None:
        mutation_db = MutationDB(self.mutation_db, self.genes)
        aa_vars.apply_mutation_db(mutation_db)

    aavf_obj = aa_vars.to_aavf_obj(
        "hydra", os.path.basename(self.reference), CONFIDENT)
    records = list(aavf_obj)

    with open("%s/mutation_report.aavf" % self.output_dir, "w+") as mut_report:
        writer = parser.Writer(mut_report, aavf_obj)
        for record in records:
            writer.write_record(record)

    # As before, the writer is closed after its file has been closed.
    writer.close()

    # cmd_drmutations
    # The report needs the mutation database. Without one the original
    # code hit an unbound ``mutation_db`` here, so the step is skipped.
    if mutation_db is not None:
        if not self.quiet:
            print("# Writing drug resistant mutation report...")

        with open("%s/dr_report.csv" % self.output_dir, "w+") as dr_report:
            dr_report.write(
                aa_vars.report_dr_mutations(mutation_db, reporting_threshold))

    self.output_stats(mapped_read_collection_arr)