Exemplo n.º 1
0
    def test_create_msa_slice_from_sam_pair_selection(self):
        """
        Checks that sam_handler.create_msa_slice_from_sam() walks through the sam records
        and picks the right records to pair up.

        Scenarios this test is meant to cover:
        - Missing mates
        - Unmapped mates
        - Mates mapped to the wrong ref
        - Pair mapped to the wrong ref
        - Low map qual threshold
        - Secondary, chimeric alignments

        CIGAR handling (H, S, X, =, M, P) should be tested in sam_record, not here.
        """
        # The actual output goes into TEST_DIR and is named after the input sam file.
        sam_basename = os.path.basename(TEST_PAIR_SELECTION_SAM)
        actual_fasta = TEST_DIR + os.sep + sam_basename.replace(".sam", ".msa.fasta")

        # Only pair selection is under test:  slicing, breadth threshold, N proportion and stop codon masking
        # are all switched off.  Mapping quality and the target reference still matter.
        slice_kwargs = dict(sam_filename=TEST_PAIR_SELECTION_SAM,
                            ref=TEST_PAIR_SELECTION_TARGET_REF,
                            out_fasta_filename=actual_fasta,
                            mapping_cutoff=MAPQ_CUTOFF,
                            read_qual_cutoff=READ_QUAL_CUTOFF,
                            max_prop_N=1.0,
                            breadth_thresh=0,
                            start_pos=0,
                            end_pos=0,
                            do_insert_wrt_ref=False,
                            do_mask_stop_codon=False)
        actual_written = sam.sam_handler.create_msa_slice_from_sam(**slice_kwargs)

        # The output fasta must have been produced and must not be empty.
        has_output = os.path.exists(actual_fasta) and os.path.getsize(actual_fasta) > 0
        self.assertTrue(has_output, actual_fasta + " doesn't exist or is empty")

        # diff_fasta_line() is expected to return None when the two fastas agree.
        diff_line = TestSamHandler.diff_fasta_line(EXPECTED_TEST_PAIR_SELECTION_FULL_MSA_FASTA, actual_fasta)
        diff_msg = ("Expected full msa fasta " + EXPECTED_TEST_PAIR_SELECTION_FULL_MSA_FASTA +
                    " different than " + actual_fasta + ":\n" + str(diff_line))
        self.assertIsNone(diff_line, diff_msg)

        # The returned count must match the number of sequences in the expected fasta.
        expected_written = Utility.get_total_seq_from_fasta(EXPECTED_TEST_PAIR_SELECTION_FULL_MSA_FASTA)
        count_msg = "Expect total written seq {} but got {} from {}".format(expected_written, actual_written,
                                                                           actual_fasta)
        self.assertEqual(expected_written, actual_written, count_msg)
Exemplo n.º 2
0
def tabulate_rates(fasttree_output_dir, output_csv_filename, comments):
    """
    Collects all the GTR model rates from all the fasttree logs in a directory and puts them into output_csv_filename.

    Recursively walks fasttree_output_dir for files matching ``*.fasttree.log``.  For each log, writes one csv row
    per GTR mutation, always in the order AC, AG, AT, CG, CT, GT.

    ASSUME that the multiple sequence aligned slice fasta is in the same folder as its fasttree log and is named
    like the log with ".fasttree.log" replaced by ".fasta", i.e.  sample_id.ref.msa.window_start_window_end.fasta

    :param str fasttree_output_dir:  directory to search (recursively) for fasttree logs
    :param str output_csv_filename:  path of the csv to write.  Overwritten if it already exists.
    :param str comments:  text written on the first line of the csv, prefixed with "#"
    :return:  None
    """
    import fnmatch

    # Fixed column of mutation names.  Iterating this tuple instead of a dict keeps the row order identical on
    # every interpreter and avoids the python2-only dict.iteritems().
    mutations = ("AC", "AG", "AT", "CG", "CT", "GT")

    # .../out/RunABC/HIV1B-nef/ABC_S89.HIV1B-nef.msa.1_300.fasttree.log
    with open(output_csv_filename, 'w') as fh_out:
        fh_out.write("#" + comments + "\n")
        fh_out.write("ID,Ref,Window_Start,Window_End,Window_Reads,Non_Gap_Window_Start,Mutation,Rate\n")

        for root, dirs, filenames in os.walk(fasttree_output_dir):
            for fasttree_log in fnmatch.filter(filenames, '*.fasttree.log'):
                fullpath_fasttree_log = os.path.join(root, fasttree_log)
                # Unpack explicitly so that an unexpected number of rates still fails loudly.
                AC, AG, AT, CG, CT, GT = fasttree.extract_gtr_rates(fullpath_fasttree_log)

                msa_slice_fasta_filename = fullpath_fasttree_log.replace(".fasttree.log", ".fasta")
                # sample_id.ref.msa.window_start_window_end.fasta
                name_split = os.path.basename(msa_slice_fasta_filename).split(".")
                window = name_split[-2]
                ref = name_split[-4]  # TODO:  what if reference has . in it?
                sample_id = ".".join(name_split[0:-4])  # sample id itself may contain "."
                window_start, window_end = window.split("_")
                nongap_window_start = Utility.get_total_nongap_nuc_by_pos(msa_slice_fasta_filename, 0)
                reads = Utility.get_total_seq_from_fasta(msa_slice_fasta_filename)

                for mutation, rate in zip(mutations, (AC, AG, AT, CG, CT, GT)):
                    fh_out.write(",".join([sample_id,
                                           ref,
                                           window_start,
                                           window_end,
                                           str(reads),
                                           str(nongap_window_start),
                                           mutation,
                                           str(rate)]) + "\n")
Exemplo n.º 3
0
    def test_create_msa_slice_from_sam_dup(self):
        """
        Checks that sam_handler.create_msa_slice_from_sam(), with duplicate removal switched on,
        walks through the non-duplicate records and picks the right records to pair up.
        """
        # Both actual outputs go into TEST_DIR and are named after the input sam file.
        sam_basename = os.path.basename(TEST_PAIR_SELECTION_REMDUP_SAM)
        actual_fasta = TEST_DIR + os.sep + sam_basename.replace(".sam", ".msa.fasta")
        actual_dup_tsv = TEST_DIR + os.sep + sam_basename.replace(".sam", ".tsv")

        # Only pair selection is under test:  slicing, breadth threshold, N proportion and stop codon masking
        # are all switched off.  Mapping quality and the target reference still matter.
        slice_kwargs = dict(sam_filename=TEST_PAIR_SELECTION_REMDUP_SAM,
                            ref=TEST_PAIR_SELECTION_TARGET_REF,
                            out_fasta_filename=actual_fasta,
                            mapping_cutoff=MAPQ_CUTOFF,
                            read_qual_cutoff=READ_QUAL_CUTOFF,
                            max_prop_N=1.0,
                            breadth_thresh=0,
                            start_pos=0,
                            end_pos=0,
                            do_insert_wrt_ref=False,
                            do_mask_stop_codon=False,
                            do_remove_dup=True,
                            out_dup_tsv_filename=actual_dup_tsv)
        actual_written = sam.sam_handler.create_msa_slice_from_sam(**slice_kwargs)

        # The duplicates tsv must have been produced and must not be empty.
        has_dup_tsv = os.path.exists(actual_dup_tsv) and os.path.getsize(actual_dup_tsv) > 0
        self.assertTrue(has_dup_tsv, actual_dup_tsv + " doesn't exist or is empty")

        # Same for the output fasta.
        has_fasta = os.path.exists(actual_fasta) and os.path.getsize(actual_fasta) > 0
        self.assertTrue(has_fasta, actual_fasta + " doesn't exist or is empty")

        # diff_fasta_line() is expected to return None when the two fastas agree.
        diff_line = TestSamHandler.diff_fasta_line(EXPECTED_TEST_PAIR_SELECTION_REMDUP_FULL_MSA_FASTA, actual_fasta)
        diff_msg = ("Expected full msa fasta " + EXPECTED_TEST_PAIR_SELECTION_REMDUP_FULL_MSA_FASTA +
                    " different than " + actual_fasta + ":\n" + str(diff_line))
        self.assertIsNone(diff_line, diff_msg)

        # The returned count must match the number of sequences in the expected fasta.
        expected_written = Utility.get_total_seq_from_fasta(EXPECTED_TEST_PAIR_SELECTION_REMDUP_FULL_MSA_FASTA)
        count_msg = "Expect total written seq {} but got {} from {}".format(expected_written, actual_written,
                                                                           actual_fasta)
        self.assertEqual(expected_written, actual_written, count_msg)
Exemplo n.º 4
0
def create_msa_slice_from_sam(sam_filename, ref, out_fasta_filename, mapping_cutoff, read_qual_cutoff, max_prop_N,
                              breadth_thresh, start_pos=0, end_pos=0, do_insert_wrt_ref=False, do_mask_stop_codon=False,
                              do_remove_dup=False, ref_len=0):
    """
    Parse SAM file contents for sequences aligned to a reference.
    Extracts the portion of the read that fits into the desired slice of the genome.
    For paired-end reads, merges the mates into a single sequence with gaps with respect to the reference.
    Creates a multiple sequence alignment (MSA) for the desired slice.
    Left and right pads the reads according to the positions within the slice.
    Writes the MSA sequences to out_fasta_filename.
    Converts query names so that they are compatible with Newick format in phylogenetic reconstruction by
        converting colons, semicolons, parentheses to underscores.

    If out_fasta_filename already exists and is non-empty, it is NOT regenerated:  the function only counts the
    sequences already in it and returns that count.

    NB:  Sam file must be query sorted.
    NB:  Only takes the primary alignment.

    :param str sam_filename: full path to sam file.  Must be queryname sorted.
    :param str ref: name of reference contig to form MSA alignments to.
                    If None, then splits out all alignments to any reference that fit within the desired slice positions.
                    Setting the ref to None is only useful when the reads are aligned to a set of multiple sequence aligned
                    reference contigs, and you don't care which reference the read hits, just that it fits in the slice.
    :param str out_fasta_filename: full path to output multiple sequence aligned fasta file for the sequences in the slice.
    :param int mapping_cutoff:  Ignore alignments with mapping quality lower than the cutoff.
    :param int read_qual_cutoff: Convert bases with quality lower than this cutoff to N unless both mates agree.
    :param float max_prop_N:  Do not output reads with proportion of N higher than the cutoff.  Only counts the bases within the slice.
                                This only makes a difference if the slice is expected to be much wider than the (merged) read length.
    :param float breadth_thresh:  Fraction of the slice that the read must cover with actual bases A, C, G, T.
                                Reads below this threshold are excluded from output.
    :param int start_pos: 1-based nucleotide start position of slice.  If 0, then uses beginning of ref.
    :param int end_pos: 1-based nucleotide end position of slice.  If 0, then uses end of ref.
    :param bool do_insert_wrt_ref: whether to exclude insertions to the reference.
                If include insertions, then the insertions will be multiple sequence aligned further by MAFFT.
    :param bool do_mask_stop_codon: whether to mask stop codons with "NNN".
                Most useful when you want to do codon analysis afterwards, as many codon models do not allow stop codons.
                Assumes that the reference starts at the beginning of a codon.
    :param bool do_remove_dup:  whether or not to exclude duplicate sequence.  Only the merged read with the highest
        sum of quality scores of aligned bases will be written to fasta if it is duplicated.  To be considered a duplicate
        the sequence must have same start coordinate with respect to reference and matching bases, gaps, N's.
    :param int ref_len: length of reference.  If 0, then takes length from sam headers.
        Only forwarded to the record iterator when do_remove_dup is False.
    :returns int:  total sequences written to multiple sequence aligned fasta
    :raises : :py:class:`exceptions.ValueError` if sam file is not queryname sorted according to the sam header
        (presumably raised by the record iterators; not raised directly here)
    """

    LOGGER.debug("About to slice fasta " + out_fasta_filename + " from " + sam_filename)

    # Reuse an existing, non-empty output rather than recomputing it.
    # NOTE(review): a run that dies midway leaves a partial, non-empty fasta behind, which this check will
    # then treat as complete on the next run -- confirm that callers clean up after failures.
    if os.path.exists(out_fasta_filename) and os.path.getsize(out_fasta_filename):
        # Logger.warn() is a deprecated alias;  warning() is the supported spelling.
        LOGGER.warning("Found existing Sliced MSA-Fasta " + out_fasta_filename + ". Not regenerating.")
        total_seq = Utility.get_total_seq_from_fasta(out_fasta_filename)
        LOGGER.debug("Done slice fasta " + out_fasta_filename)
        return total_seq

    total_written = 0
    with open(out_fasta_filename, 'w') as out_fasta_fh:
        # The two iterators take different arguments:  the de-duplicating one needs the quality cutoff and
        # insertion handling, the plain one takes ref_len.
        if do_remove_dup:
            pair_iter = uniq_record_iter(sam_filename=sam_filename, ref=ref,
                                         mapping_cutoff=mapping_cutoff, read_qual_cutoff=read_qual_cutoff,
                                         is_insert=do_insert_wrt_ref)
        else:
            pair_iter = record_iter(sam_filename=sam_filename, ref=ref, mapping_cutoff=mapping_cutoff,
                                    ref_len=ref_len)

        for pair in pair_iter:
            # Only the sequence is written out;  the quality string and stats are not used here.
            mseq, mqual, stats = pair.get_seq_qual(do_pad_wrt_ref=False, do_pad_wrt_slice=True,
                                                   q_cutoff=read_qual_cutoff,
                                                   slice_start_wrt_ref_1based=start_pos,
                                                   slice_end_wrt_ref_1based=end_pos,
                                                   do_insert_wrt_ref=do_insert_wrt_ref,
                                                   do_mask_stop_codon=do_mask_stop_codon)
            # __write_seq applies the max_prop_N and breadth_thresh filters and reports whether it wrote.
            if __write_seq(out_fasta_fh, pair.get_name(), mseq, max_prop_N, breadth_thresh):
                total_written += 1

    LOGGER.debug("Done slice fasta " + out_fasta_filename)
    return total_written