def test_nucleotide_search_unaligned_reads_read_count_aligned_subject_coverage(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for aligned read counts
    Test with subject coverage filtering
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query filtering only; the subject coverage threshold is left
    # untouched because subject coverage filtering is what this test exercises
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file] = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store, keep_sam=True)
    finally:
        # reset query filtering even if the call above raises, so the
        # module-level config does not leak into other tests
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the aligned reads count
    self.assertEqual(len(alignments.get_hit_list()),
        cfg.sam_file_unaligned_reads_total_aligned_subject_coverage)
def test_nucleotide_search_unaligned_reads_output_fasta_format(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test output file is of fasta format
    Test sam file is not removed
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file] = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store, keep_sam=True)
    finally:
        # reset query/subject filtering even if the call above raises
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # determine the output file format before the file is removed
    file_format = utilities.determine_file_format(unaligned_reads_file_fasta)

    # remove temp files before asserting so a failed assertion
    # does not leave them behind
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check for fasta output file format
    self.assertEqual("fasta", file_format)
def test_nucleotide_search_unaligned_reads_output_blast_format(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test the aligned reads file created is of the blastm8 format
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    # use a fixed basename for this test, remembering the current value
    # so it can be put back afterwards
    original_file_basename = config.file_basename
    config.file_basename = "TEST"

    try:
        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file] = nucleotide.unaligned_reads(
            cfg.sam_file_annotations, alignments, unaligned_reads_store, keep_sam=True)
    finally:
        # reset query/subject filtering and the basename even if the call raises
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold
        config.file_basename = original_file_basename

    # determine the format of the reduced aligned reads file
    file_format = utilities.determine_file_format(reduced_aligned_reads_file)

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # test file is of the blastm8 format
    self.assertEqual(file_format, "blastm8")
def test_nucleotide_search_unaligned_reads_annotations_reference(self):
    """
    Run nucleotide.unaligned_reads on the annotated bowtie2/sam file
    Query and subject coverage filtering are disabled for the run
    Check the reference annotation is recognized: two of the stored
    hits are expected for gene "UniRef50"
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    reads_store = store.Reads()

    # zero both coverage thresholds (subject first, then query)
    config.nucleotide_subject_coverage_threshold = config.nucleotide_query_coverage_threshold = 0

    # parse the sam file; two temp file paths are returned
    unaligned_fasta, reduced_alignments = nucleotide.unaligned_reads(
        cfg.sam_file_annotations,
        hit_store,
        reads_store,
        keep_sam=True)

    # put the coverage thresholds back to the test defaults
    config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
    config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # clean up the temp files
    for temp_path in (unaligned_fasta, reduced_alignments):
        utils.remove_temp_file(temp_path)

    # two of the hits should be for gene "UniRef50"
    self.assertEqual(len(hit_store.hits_for_gene("UniRef50")), 2)
def test_nucleotide_search_unaligned_reads_annotations_bug(self):
    """
    Run nucleotide.unaligned_reads on the annotated bowtie2/sam file
    Query and subject coverage filtering are disabled for the run
    Check the bug annotation: the only bug stored should be "unclassified"
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    reads_store = store.Reads()

    # zero both coverage thresholds (subject first, then query)
    config.nucleotide_subject_coverage_threshold = config.nucleotide_query_coverage_threshold = 0

    # parse the sam file; two temp file paths are returned
    unaligned_fasta, reduced_alignments = nucleotide.unaligned_reads(
        cfg.sam_file_annotations,
        hit_store,
        reads_store,
        keep_sam=True)

    # put the coverage thresholds back to the test defaults
    config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
    config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # clean up the temp files
    for temp_path in (unaligned_fasta, reduced_alignments):
        utils.remove_temp_file(temp_path)

    # there should be one bug which is unclassified
    observed_bugs = hit_store.bug_list()
    self.assertEqual(observed_bugs, ["unclassified"])
def test_translated_search_unaligned_reads_annotations_bug(self):
    """
    Run translated.unaligned_reads on the annotated rapsearch2 output file
    The subject coverage threshold is zeroed so the coverage filter is off
    Check the bug annotation: one named bug plus "unclassified" are expected
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    reads_store = store.Reads()

    # remember the active coverage threshold, then disable the filter
    saved_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    # parse the rapsearch2 output; a temp fasta path is returned
    temp_fasta = translated.unaligned_reads(
        reads_store,
        cfg.rapsearch_file_annotations,
        hit_store)

    # clean up the temp file
    utils.remove_temp_file(temp_fasta)

    # restore the coverage threshold
    config.translated_subject_coverage_threshold = saved_threshold

    # there should be one bug name and the other should be unclassified
    expected_bugs = sorted([
        "g__Bacteroides.s__Bacteroides_xylanisolvens",
        "unclassified",
    ])
    observed_bugs = sorted(hit_store.bug_list())
    self.assertEqual(observed_bugs, expected_bugs)
def test_nucleotide_search_unaligned_reads_read_count_unaligned_minimize_memory_use(self):
    """
    Run nucleotide.unaligned_reads on a bowtie2/sam output file
    The reads store is created with minimize_memory_use=True
    Query and subject coverage filtering are disabled for the run
    Check the number of unaligned reads stored
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    low_memory_reads = store.Reads(minimize_memory_use=True)

    # zero both coverage thresholds (subject first, then query)
    config.nucleotide_subject_coverage_threshold = config.nucleotide_query_coverage_threshold = 0

    # parse the sam file; two temp file paths are returned
    unaligned_fasta, reduced_alignments = nucleotide.unaligned_reads(
        cfg.sam_file_unaligned_reads,
        hit_store,
        low_memory_reads,
        keep_sam=True)

    # put the coverage thresholds back to the test defaults
    config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
    config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # clean up the temp files
    for temp_path in (unaligned_fasta, reduced_alignments):
        utils.remove_temp_file(temp_path)

    # check the unaligned reads count
    unaligned_total = low_memory_reads.count_reads()
    self.assertEqual(unaligned_total, cfg.sam_file_unaligned_reads_total_unaligned)
def test_translated_search_unaligned_reads_annotations_reference(self):
    """
    Run translated.unaligned_reads on the annotated rapsearch2 output file
    The subject coverage threshold is zeroed so the coverage filter is off
    Check the reference annotation: three hits are expected for "UniRef50"
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    reads_store = store.Reads()

    # remember the active coverage threshold, then disable the filter
    saved_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    # parse the rapsearch2 output; a temp fasta path is returned
    temp_fasta = translated.unaligned_reads(
        reads_store,
        cfg.rapsearch_file_annotations,
        hit_store)

    # clean up the temp file
    utils.remove_temp_file(temp_fasta)

    # restore the coverage threshold
    config.translated_subject_coverage_threshold = saved_threshold

    # three of the hits should be for gene "UniRef50"
    self.assertEqual(len(hit_store.hits_for_gene("UniRef50")), 3)
def test_translated_search_unaligned_reads_blastm8(self):
    """
    Test the unaligned reads and the store alignments
    Test with a blastm8-like output file
    Test with empty reads structure
    Test that function does not require gene lengths in reference id
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    try:
        # load the blastm8-like output by hand to build the expected hits;
        # the context manager closes the file even if parsing raises
        with open(cfg.rapsearch2_output_file_without_header) as file_handle:
            for line in file_handle:
                # skip comment lines
                if line.startswith("#"):
                    continue

                data = line.strip().split(config.blast_delimiter)

                referenceid = data[config.blast_reference_index]
                queryid = data[config.blast_query_index]
                identity = float(data[config.blast_identity_index])
                alignment_length = float(data[config.blast_aligned_length_index])

                alignments.add(referenceid, 0, queryid,
                    identity / 100.0 * alignment_length,
                    "unclassified", alignment_length)

        alignments_test = store.Alignments()
        unaligned_reads_store = store.Reads()

        # load the blastm8-like output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store,
            cfg.rapsearch2_output_file_without_header,
            alignments_test)
    finally:
        # reset the coverage threshold even if loading raises
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the values are unchanged
    self.assertEqual(sorted(alignments.get_hit_list()),
        sorted(alignments_test.get_hit_list()))
def test_nucleotide_search_unaligned_reads_annotations_gene_length(self):
    """
    Run nucleotide.unaligned_reads on the annotated bowtie2/sam file
    Query and subject coverage filtering are disabled for the run
    Check four hits are stored and that each hit length matches the
    value expected for its reference, computed with the sam read length
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    reads_store = store.Reads()

    # zero both coverage thresholds (subject first, then query)
    config.nucleotide_subject_coverage_threshold = config.nucleotide_query_coverage_threshold = 0

    # parse the sam file; two temp file paths are returned
    unaligned_fasta, reduced_alignments = nucleotide.unaligned_reads(
        cfg.sam_file_annotations,
        hit_store,
        reads_store,
        keep_sam=True)

    # put the coverage thresholds back to the test defaults
    config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
    config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # clean up the temp files
    for temp_path in (unaligned_fasta, reduced_alignments):
        utils.remove_temp_file(temp_path)

    # there should be 4 hits identified
    hit_records = hit_store.get_hit_list()
    self.assertEqual(len(hit_records), 4)

    # expected lengths for the set (UniRef50) and default gene lengths
    sam_read_length = 151
    uniref50_length = (abs(2000 - sam_read_length)+1)/1000.0
    default_length = (abs(1000 - sam_read_length)+1)/1000.0

    # each hit is (query, bug, reference, score, length)
    for _, _, reference, _, length in hit_records:
        expected = uniref50_length if reference == "UniRef50" else default_length
        self.assertEqual(length, expected)
def test_translated_search_unaligned_reads_annotations_gene_length(self):
    """
    Run translated.unaligned_reads on the annotated rapsearch2 output file
    The subject coverage threshold is zeroed so the coverage filter is off
    Check four hits are stored and that each hit length matches the
    value expected for its reference
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    reads_store = store.Reads()

    # remember the active coverage threshold, then disable the filter
    saved_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    # parse the rapsearch2 output; a temp fasta path is returned
    temp_fasta = translated.unaligned_reads(
        reads_store,
        cfg.rapsearch_file_annotations,
        hit_store)

    # clean up the temp file
    utils.remove_temp_file(temp_fasta)

    # restore the coverage threshold
    config.translated_subject_coverage_threshold = saved_threshold

    # there should be 4 hits identified
    hit_records = hit_store.get_hit_list()
    self.assertEqual(len(hit_records), 4)

    # expected lengths for the set (UniRef50) and default gene lengths
    query_read_length = 50
    uniref50_length = (abs(2000 - query_read_length) + 1) / 1000.0
    default_length = (abs(1000 - query_read_length) + 1) / 1000.0

    # each hit is (query, bug, reference, score, length)
    for _, _, reference, _, length in hit_records:
        expected = uniref50_length if reference == "UniRef50" else default_length
        self.assertEqual(length, expected)
def test_nucleotide_search_unaligned_reads_read_count_aligned_evalue_threshold(self):
    """
    Run nucleotide.unaligned_reads on a bowtie2/sam output file
    Query and subject coverage filtering are disabled for the run
    The evalue threshold is lowered to 1e-15 for the run
    Check the aligned read count is unchanged, i.e. the evalue
    threshold does not filter alignments
    """

    # stores filled in by the function under test
    hit_store = store.Alignments()
    reads_store = store.Reads()

    # zero both coverage thresholds (subject first, then query)
    config.nucleotide_subject_coverage_threshold = config.nucleotide_query_coverage_threshold = 0

    # update the evalue threshold to a number less than those for the alignment file
    saved_evalue_threshold = config.evalue_threshold
    config.evalue_threshold = 1e-15

    # parse the sam file; two temp file paths are returned
    unaligned_fasta, reduced_alignments = nucleotide.unaligned_reads(
        cfg.sam_file_unaligned_reads,
        hit_store,
        reads_store,
        keep_sam=True)

    # put the coverage thresholds back to the test defaults
    config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
    config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # clean up the temp files
    for temp_path in (unaligned_fasta, reduced_alignments):
        utils.remove_temp_file(temp_path)

    # restore the evalue threshold
    config.evalue_threshold = saved_evalue_threshold

    # check the aligned reads count (all reads should be aligned even though they do not
    # meet the threshold as the evalue threshold is not applied for this type of alignment)
    aligned_total = len(hit_store.get_hit_list())
    self.assertEqual(aligned_total, cfg.sam_file_unaligned_reads_total_aligned)
def test_nucleotide_search_unaligned_reads_read_count_aligned_identity_threshold(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test for aligned read counts
    Test the identity threshold does filter alignments
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    # update the identity threshold to a number larger than those in the alignments
    original_identity_threshold = config.identity_threshold
    config.identity_threshold = 101.0

    try:
        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file] = nucleotide.unaligned_reads(
            cfg.sam_file_unaligned_reads, alignments, unaligned_reads_store, keep_sam=True)
    finally:
        # reset query/subject filtering and the identity threshold
        # even if the call above raises
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold
        config.identity_threshold = original_identity_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # check the aligned reads count (it should be zero as none of the alignments
    # should pass a threshold that is larger than all of their identities)
    self.assertEqual(len(alignments.get_hit_list()), 0)
def test_nucleotide_search_unaligned_reads_scores(self):
    """
    Test the unaligned reads and the store alignments
    Test with a bowtie2/sam output file
    Test the scores are based on percent identities
    """

    # create a set of alignments
    alignments = store.Alignments()
    unaligned_reads_store = store.Reads()

    # turn off query/subject filtering
    config.nucleotide_subject_coverage_threshold = 0
    config.nucleotide_query_coverage_threshold = 0

    try:
        # read in the aligned and unaligned reads
        [unaligned_reads_file_fasta, reduced_aligned_reads_file] = nucleotide.unaligned_reads(
            cfg.sam_file_annotations, alignments, unaligned_reads_store, keep_sam=True)
    finally:
        # reset query/subject filtering even if the call above raises
        config.nucleotide_subject_coverage_threshold = self.default_nucleotide_subject_coverage_threshold
        config.nucleotide_query_coverage_threshold = self.default_nucleotide_query_coverage_threshold

    # remove temp files
    utils.remove_temp_file(unaligned_reads_file_fasta)
    utils.remove_temp_file(reduced_aligned_reads_file)

    # there should be 4 hits identified; asserting the count keeps the
    # score loop below from passing vacuously on an empty hit list
    all_hits = alignments.get_hit_list()
    self.assertEqual(len(all_hits), 4)

    # check every hit has the expected score
    expected_score = math.pow(151.0, config.match_power)
    for hit in all_hits:
        query, bug, reference, score, length = hit
        self.assertEqual(score, expected_score)
def test_translated_search_unaligned_reads_blastm8_coverage_filter(self):
    """
    Test the unaligned reads and the store alignments
    Test with a blastm8-like output file
    Test with empty reads structure
    Test that function does not require gene lengths in reference id
    Test with the coverage filter
    Test with query length annotations
    Test that an alignment with query start larger than query end is not filtered
    """

    # create a set of alignments
    alignments = store.Alignments()

    # set the coverage threshold to a small value so as to have some alignments pass
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0.50

    try:
        # get the set of allowed proteins
        allowed_proteins = blastx_coverage.blastx_coverage(
            cfg.rapsearch2_output_file_without_header_coverage,
            config.translated_subject_coverage_threshold, alignments, True)

        # load the blastm8-like output by hand to build the expected hits;
        # the context manager closes the file even if parsing raises
        with open(cfg.rapsearch2_output_file_without_header_coverage) as file_handle:
            for line in file_handle:
                # skip comment lines
                if line.startswith("#"):
                    continue

                data = line.strip().split(config.blast_delimiter)

                referenceid = data[config.blast_reference_index]
                gene, length, bug = alignments.process_reference_annotation(referenceid)
                # only the query id is needed here, the query length is discarded
                queryid, _ = utilities.get_length_annotation(data[config.blast_query_index])
                identity = float(data[config.blast_identity_index])
                alignment_length = float(data[config.blast_aligned_length_index])

                # only store alignments to proteins that passed the coverage filter
                if gene in allowed_proteins:
                    alignments.add(gene, length, queryid,
                        identity / 100.0 * alignment_length,
                        bug, alignment_length)

        alignments_test = store.Alignments()
        unaligned_reads_store = store.Reads()

        # load the blastm8-like output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store,
            cfg.rapsearch2_output_file_without_header_coverage,
            alignments_test)
    finally:
        # reset the coverage threshold even if loading raises
        config.translated_subject_coverage_threshold = current_coverage_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the values are unchanged
    self.assertEqual(sorted(alignments.get_hit_list()),
        sorted(alignments_test.get_hit_list()))
def test_translated_search_unaligned_reads_identity_threshold(self):
    """
    Test the unaligned reads function
    Test with a rapsearch output file
    Test the identity threshold filtering
    Test without the coverage filter
    """

    # create a set of alignments
    alignments = store.Alignments()

    # set the coverage threshold to zero so as to not test with filter on
    current_coverage_threshold = config.translated_subject_coverage_threshold
    config.translated_subject_coverage_threshold = 0

    # set a new threshold that will select 3 of the 5 alignments
    original_identity_threshold = config.identity_threshold
    config.identity_threshold = 60.0

    try:
        # load the rapsearch output by hand to build the expected hits;
        # the context manager closes the file even if parsing raises
        with open(cfg.rapsearch2_output_file_with_header) as file_handle:
            for line in file_handle:
                # skip comment/header lines
                if line.startswith("#"):
                    continue

                data = line.strip().split(config.blast_delimiter)

                referenceid = data[config.blast_reference_index]
                queryid = data[config.blast_query_index]
                identity = float(data[config.blast_identity_index])
                alignment_length = float(data[config.blast_aligned_length_index])

                # only store those alignments with identities that meet threshold
                # NOTE(review): strict ">" is used here; confirm it matches the
                # comparison applied inside translated.unaligned_reads
                if identity > config.identity_threshold:
                    alignments.add(referenceid, 0, queryid,
                        identity / 100.0 * alignment_length,
                        "unclassified", alignment_length)

        alignments_test = store.Alignments()
        unaligned_reads_store = store.Reads()

        # load the rapsearch output with the unaligned reads function
        unaligned_file_fasta = translated.unaligned_reads(
            unaligned_reads_store,
            cfg.rapsearch2_output_file_with_header,
            alignments_test)
    finally:
        # reset the coverage threshold and set the identity threshold
        # back to its original value even if loading raises
        config.translated_subject_coverage_threshold = current_coverage_threshold
        config.identity_threshold = original_identity_threshold

    # remove temp file
    utils.remove_temp_file(unaligned_file_fasta)

    # check the total number of alignments is the same
    self.assertEqual(len(alignments.get_hit_list()),
        len(alignments_test.get_hit_list()))