def select_pileup_range_and_truncate_output(
        self, pileup, startpos, endpos,
        pileup_select_range_expected_out, pileup_truncate_expected_out):
    """
    select_pileup_range_and_truncate_output - Checks if the program works
    as expected when truncating contiguous start and end regions after
    first selecting a specified range.

    INPUT:
        [2D ARRAY of DICTIONARIES] [pileup]
        [INT] [startpos]
        [INT] [endpos]
        [2D ARRAY OF DICTIONARIES] [pileup_select_range_expected_out]
            # expected pileup once the range has been selected
        [2D ARRAY OF DICTIONARIES] [pileup_truncate_expected_out]
            # expected pileup once the selected range has been truncated

    RETURN:
        [None]

    POST:
        Asserts that the pileup array matches the expected output after
        select_pileup_range and again after truncate_output.
    """
    pileup_list = Pileup_List(list(map(Pileup, pileup)))

    # Step 1: restrict the pileups to the requested range. Positions
    # before startpos and after endpos should have been ignored.
    pileup_list.select_pileup_range(startpos, endpos)
    after_selection = pileup_list.get_pileups_as_array()
    assert after_selection == pileup_select_range_expected_out

    # Step 2: truncate the already range-restricted pileups and check
    # that the result is truncated as expected.
    pileup_list.truncate_output()
    after_truncation = pileup_list.get_pileups_as_array()
    assert after_truncation == pileup_truncate_expected_out
def test_remove_no_coverage(self, pileup, expected_truncated_pileup,
                            expected_left_pos_truncated,
                            expected_right_pos_truncated):
    """
    test_remove_no_coverage - Checks that after removing all no coverage
    positions from the pileup the output is as expected.

    INPUT:
        [2D ARRAY OF DICTIONARIES] [pileup] # to be truncated
        [2D ARRAY OF DICTIONARIES] [expected_truncated_pileup]
        [INT] [expected_left_pos_truncated]
            # number of contiguous left positions that were truncated
        [INT] [expected_right_pos_truncated]
            # number of contiguous right positions that were truncated

    RETURN:
        [None]

    POST:
        Checks that the expected outputs match the actual output
    """
    pileup_list = Pileup_List(list(map(Pileup, pileup)))
    pileup_list.remove_no_coverage()

    # The remaining pileup contents must match the expectation ...
    actual_pileup = pileup_list.get_pileups_as_array()
    assert actual_pileup == expected_truncated_pileup

    # ... and so must the truncation counts reported for each end.
    left_truncated = pileup_list.get_num_left_positions_truncated()
    assert left_truncated == expected_left_pos_truncated

    right_truncated = pileup_list.get_num_right_positions_truncated()
    assert right_truncated == expected_right_pos_truncated
def test_select_pileup_range_and_remove_no_coverage(
        self, pileup, startpos, endpos, expected_remove_no_coverage):
    """
    test_select_pileup_range_and_remove_no_coverage - Checks if the
    program works as expected when removing all no coverage regions
    after first selecting a specified range.

    INPUT:
        [2D ARRAY of DICTIONARIES] [pileup]
        [INT] [startpos]
        [INT] [endpos]
        [2D ARRAY OF DICTIONARIES] [expected_remove_no_coverage]

    RETURN:
        [None]

    POST:
        Asserts that the pileup array obtained after select_pileup_range
        followed by remove_no_coverage equals
        expected_remove_no_coverage.
    """
    pileup_list = Pileup_List(list(map(Pileup, pileup)))

    # Order matters: the range is selected first, then the no coverage
    # regions are removed from what is left.
    pileup_list.select_pileup_range(startpos, endpos)
    pileup_list.remove_no_coverage()

    actual_pileup = pileup_list.get_pileups_as_array()
    assert actual_pileup == expected_remove_no_coverage
def test_truncate_output(self, pileup, expected_truncated_pileup,
                         expected_left_pos_truncated,
                         expected_right_pos_truncated):
    """
    test_truncate_output - Checks that the expected truncated outputs
    match the actual output.

    INPUT:
        [2D ARRAY OF DICTIONARIES] [pileup] # to be truncated
        [2D ARRAY OF DICTIONARIES] [expected_truncated_pileup]
        [INT] [expected_left_pos_truncated]
            # number of contiguous left positions that were truncated
        [INT] [expected_right_pos_truncated]
            # number of contiguous right positions that were truncated

    RETURN:
        [None]

    POST:
        Checks that the expected outputs match the actual output
    """
    pileup_list = Pileup_List(list(map(Pileup, pileup)))
    pileup_list.truncate_output()

    # Compare the truncated pileup contents first ...
    actual_pileup = pileup_list.get_pileups_as_array()
    assert actual_pileup == expected_truncated_pileup

    # ... then the number of positions dropped from each end.
    left_truncated = pileup_list.get_num_left_positions_truncated()
    assert left_truncated == expected_left_pos_truncated

    right_truncated = pileup_list.get_num_right_positions_truncated()
    assert right_truncated == expected_right_pos_truncated
def parse_pileup_list_from_bam(references, file_list):
    """
    # ========================================================================

    PARSE PILEUP LIST FROM BAM


    PURPOSE
    -------

    Builds a Pileup_List object from Reference objects and several BAM
    files. Each BAM file is parsed with parse_pileup_from_bam, so the
    resulting Pileup_List holds one parsed pileup per BAM file, in the
    order the files were given. The Reference objects must correspond to
    every BAM file.


    INPUT
    -----

    [LIST (REFERENCE)] [references]
        A list of quasitools Reference objects.

    [LIST (BAM FILE LOCATIONS)] [file_list]
        A list of BAM file locations, each corresponding to one alignment
        pileup. All BAM files must correspond to the same associated
        Reference objects.


    RETURN
    ------

    [Pileup_List]
        A new Pileup_List object representing a collection of Pileup
        objects.

    # ========================================================================
    """
    # One parse_pileup_from_bam call per BAM location, input order kept.
    return Pileup_List([parse_pileup_from_bam(references, location)
                        for location in file_list])
def matrix(self, request):
    """
    matrix - test fixture for test_get_similarity_matrix function
             and test_get_distance_matrix function

    INPUT:
        [LIST OF TUPLES]
        request.param[0]---[BOOL] [normalize] # normalized or not
        request.param[1]---[ARRAY] [pileup list]
        request.param[2]---[ARRAY] [pileup_files] # file names
                           corresponding to pileups
        request.param[3]---[ARRAY] normalized or unnormalized similarity
                           csv-format output
        request.param[4]---[ARRAY] normalized or unnormalized distance
                           csv-format output
        request.param[5]---[INT or NONE] [startpos or default if NONE]
        request.param[6]---[INT or NONE] [endpos or default if NONE]

    RETURN:
        [DistanceMatrix] [matrix with the pileup to be used]

    POST:
        self.expected_csv_distance is now a csv representation of the
        expected distance that should be calculated from this matrix.

        self.expected_csv_similarity is now a csv representation of the
        expected similarity that should be calculated from this matrix.
    """
    params = request.param
    pileup_list = Pileup_List([Pileup(entry) for entry in params[1]])

    # A range is only selected when both startpos and endpos are ints
    # (i.e. neither of them is None).
    range_given = type(params[5]) is int and type(params[6]) is int
    if range_given:
        pileup_list.select_pileup_range(params[5], params[6])

    # Normalize only when the normalize flag is exactly True.
    if params[0] is True:
        pileup_list.normalize_pileups()

    # Build the matrix from the numerical pileups and their file names.
    numerical_pileups = pileup_list.get_pileups_as_numerical_array()
    distance_matrix = DistanceMatrix(numerical_pileups, params[2])

    # Expose the expected csv outputs to the tests using this fixture.
    self.expected_csv_similarity = params[3]
    self.expected_csv_distance = params[4]

    return distance_matrix
def test_construct_pileup_list(self):
    """
    test_construct_pileup_list - Checks that the pileup length and the
    first few indices of the pileup are correct.

    The pileup list is constructed from self.test_cp_files and
    self.references.

    INPUT:
        [None]

    RETURN:
        [None]

    POST:
        [None]
    """
    # Every pileup is expected to have this many positions; the
    # numerical form is expected to be four times as long.
    expected_positions = 2844

    expected_first_ten = (
        [
            {'C': 12},
            {'C': 12},
            {'T': 12},
            {'C': 12},
            {'G': 2, 'C': 3, 'T': 1, 'A': 6},
            {'G': 12},
            {'G': 12},
            {'T': 12},
            {'C': 12},
            {'G': 2, 'C': 3, 'T': 1, 'A': 6},
        ],
        [
            {'C': 12},
            {'C': 12},
            {'T': 12},
            {'C': 12},
            {'A': 6, 'C': 5, 'G': 1},
            {'G': 12},
            {'A': 4, 'G': 8},
            {'T': 12},
            {'C': 12},
            {'A': 7, 'T': 1, 'C': 3, 'G': 1},
        ],
    )

    pileup_list = Pileup_List.construct_pileup_list(self.test_cp_files,
                                                    self.references)
    as_array = pileup_list.get_pileups_as_array()
    as_numerical_array = pileup_list.get_pileups_as_numerical_array()

    # Two pileups, each with the expected number of positions.
    assert len(as_array) == 2
    for index in (0, 1):
        assert len(as_array[index]) == expected_positions
    for index in (0, 1):
        assert len(as_numerical_array[index]) == (expected_positions * 4)

    # Spot-check the first ten positions of both pileups.
    for index in (0, 1):
        assert as_array[index][0:10] == expected_first_ten[index]
def dist(ctx, reference, bam, normalize, output_distance, startpos, endpos,
         output, no_coverage):
    """
    dist - Performs the main part of the program

    INPUT:
        [CONTEXT] [ctx]
        [FASTA FILE LOCATION] [reference]
        [BAM FILE LOCATION] [bam]
        [BOOL] [normalize/dont_normalize]
        [BOOL] [output_distance/output_similarity]
        [INT] [startpos]
            One-based start position; the first position is used if None.
        [INT] [endpos]
            One-based end position; the pileup length is used if None.
        [STRING] [output]
            Output the CSV-formatted matrix output in a file
            instead of in the terminal.
        [STRING] [truncate/remove_no_coverage/keep_no_coverage]
            Options to truncate low-coverage regions on the ends of the
            pileup, ignore all low coverage regions, or keep all low
            coverage regions

    RETURN:
        None.

    RAISES:
        [click.UsageError] when fewer than two BAM files are given, when
        the start or end position is below 1, out of order or beyond the
        pileup length, when the constructed pileup is empty, or when the
        whole pileup was removed for lack of coverage.

    POST:
        The distance matrix is printed out unless an error message was
        raised.
    """
    if len(bam) < 2:
        raise click.UsageError("At least two bam file locations are required"
                               " to perform quasispecies distance comparison")

    # indicate if the start or end position is < 0 or a priori invalid
    # (None means "not specified" and is defaulted further below)
    if isinstance(startpos, int) and startpos < 1:
        raise click.UsageError("Start position must be >= 1.")
    if isinstance(endpos, int) and endpos < 1:
        raise click.UsageError("End position must be >= 1.")
    if (isinstance(startpos, int) and isinstance(endpos, int)
            and startpos > endpos):
        raise click.UsageError("Start position must be <= end position")

    # Build the reference object.
    references = parse_references_from_fasta(reference)
    pileups = Pileup_List.construct_pileup_list(bam, references)

    # The length is read once here; nothing in this function modifies the
    # pileups before modify_pileups() is called below.
    pileup_length = pileups.get_pileup_length()

    if startpos is None:
        startpos = 1
    if endpos is None:
        endpos = pileup_length

    if pileup_length == 0:
        raise click.UsageError("Empty pileup was produced from BAM files. "
                               "Halting program")

    click.echo("The start position is %d." % startpos)
    click.echo("The end position is %d." % endpos)
    click.echo("Constructed pileup from reference.")
    # click.echo the number of positions in pileup
    click.echo("The pileup covers %d positions before modifications."
               % pileup_length)

    # indicate whether the user-specified start and end position is out
    # of bounds (comparing to actual number of positions in pileup)
    if startpos > pileup_length:
        raise click.UsageError("Start position must be less than or"
                               " equal to the number of nucleotide base "
                               "positions in pileup (%s)." % pileup_length)
    if endpos > pileup_length:
        raise click.UsageError("End position must be less than or equal to "
                               "the number of nucleotide base positions in "
                               "pileup (%s)." % pileup_length)

    # we convert the start and end positions from one-based indexing to
    # zero-based indexing which is expected by distance.py and pileup.py
    startpos -= 1
    endpos -= 1

    # if there is no errors so far, proceed with running program
    modified = modify_pileups(ctx, normalize, startpos, endpos, no_coverage,
                              pileups)

    # Strings must be compared by value: an identity check ("is not")
    # against a literal depends on interpreter string interning.
    if no_coverage != 'keep_no_coverage' and len(modified) == 0:
        raise click.UsageError("Entire pileup was truncated due to "
                               "lack of coverage. Halting program")

    matrix = DistanceMatrix(modified, bam)

    if output_distance:
        click.echo("Outputting an angular cosine distance matrix.")
        csv_matrix = matrix.get_distance_matrix_as_csv()
    else:
        click.echo("Outputting a cosine similarity matrix.")
        csv_matrix = matrix.get_similarity_matrix_as_csv()

    # Write to the output file when one was supplied, else to the terminal.
    if output:
        output.write(csv_matrix)
    else:
        click.echo(csv_matrix)