예제 #1
0
    def setUp(self):
        """Create the sequences and collections shared by the tests.

        The ``(id, sequence)`` tuples are defined first and the sequence
        objects are derived from them, so both views of the same data
        always agree.
        """
        # (id, sequence) tuples
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        # DNA sequences, upper case and lower case
        self.seqs1 = [DNASequence(seq, id=seq_id)
                      for seq_id, seq in self.seqs1_t]
        self.d1, self.d2 = self.seqs1
        self.seqs1_lower = [DNASequence(seq.lower(), id=seq_id)
                            for seq_id, seq in self.seqs1_t]
        self.d1_lower, self.d2_lower = self.seqs1_lower

        # RNA sequences (r3 contains gap characters)
        self.seqs2 = [RNASequence(seq, id=seq_id)
                      for seq_id, seq in self.seqs2_t]
        self.r1, self.r2, self.r3 = self.seqs2

        # DNA sequence holding an 'X', used for the invalid collection
        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs3 = self.seqs1 + self.seqs2

        # collections under test
        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])
예제 #2
0
 def test_reverse_complement(self):
     """reverse_complement returns the expected sequence or raises"""
     obs = self.b1.reverse_complement()
     self.assertEqual(obs, DNASequence("TGTAATC"))

     obs = self.b2.reverse_complement()
     self.assertEqual(obs, DNASequence("GGTACCGGT"))

     # b3 is expected to be rejected
     with self.assertRaises(BiologicalSequenceError):
         self.b3.reverse_complement()

     obs = self.b4.reverse_complement()
     self.assertEqual(obs, DNASequence("NVHDBMRSWYK"))
예제 #3
0
 def setUp(self):
     """Create the strings, FASTA text and sequence objects for the tests."""
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']

     # FASTA text, written one record per source line
     self.fasta_no_label = (
         '>0\nAAAA\n'
         '>1\nCCCC\n'
         '>2\ngggg\n'
         '>3\nuuuu')
     self.fasta_with_label = (
         '>1st\nAAAA\n'
         '>2nd\nCCCC\n'
         '>3rd\nGGGG\n'
         '>4th\nUUUU')
     # same records, wrapped at a line width of two characters
     self.fasta_with_label_lw2 = (
         '>1st\nAA\nAA\n'
         '>2nd\nCC\nCC\n'
         '>3rd\nGG\nGG\n'
         '>4th\nUU\nUU')

     self.alignment_dict = dict([
         ('1st', 'AAAA'),
         ('2nd', 'CCCC'),
         ('3rd', 'GGGG'),
         ('4th', 'UUUU'),
     ])

     # the same two records as DNASequence and as BiologicalSequence
     records = [('ACTCGAGATC', 'seq1'), ('GGCCT', 'seq2')]
     self.sequence_objects_a = [
         DNASequence(sequence, label) for sequence, label in records]
     self.sequence_objects_b = [
         BiologicalSequence(sequence, label) for sequence, label in records]

     aligned = [
         DNASequence("ACC--G-GGTA..", id="seq1"),
         DNASequence("TCC--G-GGCA..", id="seqs2"),
     ]
     self.alignment = Alignment(aligned)
예제 #4
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        """Align the sequences in seq_path against the template alignment.

        seq_path: path to a FASTA file of candidate sequences
        result_path: if not None, the aligned sequences are written to this
         path in FASTA format and None is returned; otherwise the
         Alignment is returned
        log_path: passed to NastLogger
        failure_path: if not None, the sequences that failed to align are
         written to this path in FASTA format

        Reads 'template_filepath', 'pairwise_alignment_method', 'min_pct'
        and 'min_len' from self.Params.
        """
        # parse_fasta is presumably lazy (TODO confirm), so the candidate
        # file stays open until pynast_seqs has consumed the records; the
        # with-block then closes it even if alignment raises.
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences; the comprehension reads the whole
            # file before the handle is closed
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                # replace '.' characters with '-' characters
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_file)]
            template_alignment = Alignment.from_fasta_records(
                template_records, DNASequence, validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # convert the pynast results into scikit-bio objects
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(pynast_aligned.to_fasta())
            return None

        return pynast_aligned
예제 #5
0
 def test_nondegenerates_gap_mixed_case(self):
     """nondegenerates expands '-M.m', keeping gap characters and case"""
     exp = [DNASequence(seq) for seq in ('-A.a', '-A.c', '-C.a', '-C.c')]
     degenerate = DNASequence('-M.m')
     # sort by string form so the comparison has a fixed order
     obs = sorted(degenerate.nondegenerates(), key=str)
     self.assertEqual(obs, exp)
예제 #6
0
 def test_nondegenerates_mixed_degens(self):
     """nondegenerates expands 'RGY' into its four concrete sequences"""
     exp = [DNASequence(seq) for seq in ('AGC', 'AGT', 'GGC', 'GGT')]
     degenerate = DNASequence('RGY')
     # sort by string form so the comparison has a fixed order
     obs = sorted(degenerate.nondegenerates(), key=str)
     self.assertEqual(obs, exp)
예제 #7
0
    def test_subalignment(self):
        """subalignment filters by sequence and/or position, with inversion
        """
        # sequences selected by id, then by index: d1 and d3 both times
        obs = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        obs = self.a1.subalignment(seqs_to_keep=[0, 2])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        # inverted sequence selection (by id, then by index): only d2
        obs = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        # positions 0, 2 and 3 of every sequence
        obs = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        exp = Alignment([
            DNASequence('.AC', id="d1"),
            DNASequence('TAC', id="d2"),
            DNASequence('.AC', id="d3"),
        ])
        self.assertEqual(obs, exp)

        # every position except 0, 2 and 3
        obs = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                   invert_positions_to_keep=True)
        exp = Alignment([
            DNASequence('.C-GTTGG..', id="d1"),
            DNASequence('TCGGT-GGCC', id="d2"),
            DNASequence('-C-GTTGC--', id="d3"),
        ])
        self.assertEqual(obs, exp)

        # sequence and position selection combined
        obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                                   positions_to_keep=[0, 2, 3])
        exp = Alignment([
            DNASequence('.AC', id="d1"),
            DNASequence('.AC', id="d3"),
        ])
        self.assertEqual(obs, exp)

        # both selections inverted
        obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                                   positions_to_keep=[0, 2, 3],
                                   invert_seqs_to_keep=True,
                                   invert_positions_to_keep=True)
        exp = Alignment([DNASequence('TCGGT-GGCC', id="d2")])
        self.assertEqual(obs, exp)
예제 #8
0
    def test_validate_lengths(self):
        """_validate_lengths is False only for sequences of unequal length
        """
        for aln in (self.a1, self.a2, self.empty):
            self.assertTrue(aln._validate_lengths())

        # a single sequence
        single = Alignment([DNASequence('TTT', id="d1")])
        self.assertTrue(single._validate_lengths())

        # two sequences of different lengths
        ragged = Alignment([
            DNASequence('TTT', id="d1"),
            DNASequence('TT', id="d2"),
        ])
        self.assertFalse(ragged._validate_lengths())
예제 #9
0
    def test_to_phylip_map_labels(self):
        """to_phylip relabels sequences when map_labels is True
        """
        aln = Alignment([
            DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
            DNASequence('.-ACC-GTTGC--', id="d3"),
        ])

        obs_phylip, obs_id_map = aln.to_phylip(map_labels=True,
                                               label_prefix="s")

        # generated label -> original id
        self.assertEqual(obs_id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
        exp_phylip = ("3 13\n"
                      "s1 ..ACC-GTTGG..\n"
                      "s2 TTACCGGT-GGCC\n"
                      "s3 .-ACC-GTTGC--")
        self.assertEqual(obs_phylip, exp_phylip)
예제 #10
0
    def test_to_phylip(self):
        """to_phylip keeps the original ids when map_labels is False
        """
        aln = Alignment([
            DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
            DNASequence('.-ACC-GTTGC--', id="d3"),
        ])

        obs_phylip, obs_id_map = aln.to_phylip(map_labels=False)

        # every id maps to itself
        self.assertEqual(obs_id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
        exp_phylip = ("3 13\n"
                      "d1 ..ACC-GTTGG..\n"
                      "d2 TTACCGGT-GGCC\n"
                      "d3 .-ACC-GTTGC--")
        self.assertEqual(obs_phylip, exp_phylip)
예제 #11
0
 def test_iupac_degenerate_characters(self):
     """iupac_degenerate_characters works on an instance and on the class"""
     upper = 'BDHKMNSRWVY'
     # the expected set holds both cases of every degenerate character
     exp = set(upper) | set(upper.lower())
     self.assertEqual(self.b1.iupac_degenerate_characters(), exp)
     self.assertEqual(DNASequence.iupac_degenerate_characters(), exp)
예제 #12
0
 def test_iupac_degeneracies(self):
     """iupac_degeneracies works on an instance and on the class"""
     upper = {
         'B': 'CTG',
         'D': 'ATG',
         'H': 'ACT',
         'K': 'TG',
         'M': 'AC',
         'N': 'ACTG',
         'S': 'CG',
         'R': 'AG',
         'W': 'AT',
         'V': 'ACG',
         'Y': 'CT',
     }
     # each lower-case entry mirrors its upper-case counterpart
     exp = {}
     for char, bases in upper.items():
         exp[char] = set(bases)
         exp[char.lower()] = set(bases.lower())

     self.assertEqual(self.b1.iupac_degeneracies(), exp)
     self.assertEqual(DNASequence.iupac_degeneracies(), exp)
예제 #13
0
    def test_is_valid(self):
        """is_valid is False for unequal lengths or invalid characters
        """
        for aln in (self.a1, self.a2, self.empty):
            self.assertTrue(aln.is_valid())

        # invalid because of length mismatch
        mismatched_lengths = Alignment([
            DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGC', id="d2"),
        ])
        self.assertFalse(mismatched_lengths.is_valid())

        # invalid because of invalid characters
        invalid_characters = Alignment([
            DNASequence('..ACC-GTXGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
        ])
        self.assertFalse(invalid_characters.is_valid())
예제 #14
0
 def test_iupac_characters(self):
     """iupac_characters works on an instance and on the class"""
     upper = 'ACBDGHKMNSRTWVY'
     # the expected set holds both cases of every character
     exp = set(upper + upper.lower())
     self.assertEqual(self.b1.iupac_characters(), exp)
     self.assertEqual(DNASequence.iupac_characters(), exp)
예제 #15
0
 def test_iupac_characters(self):
     """iupac_characters works on an instance and on the class"""
     upper = set('ACBDGHKMNSRTWVY')
     lower = set(char.lower() for char in upper)
     exp = upper | lower
     self.assertEqual(self.b1.iupac_characters(), exp)
     self.assertEqual(DNASequence.iupac_characters(), exp)
예제 #16
0
    def setUp(self):
        """Create the aligned DNA and RNA fixtures shared by the tests.

        The ``(id, sequence)`` tuples are defined first and the sequence
        objects are derived from them.
        """
        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        # gapped DNA sequences
        self.seqs1 = [DNASequence(seq, id=seq_id)
                      for seq_id, seq in self.seqs1_t]
        self.d1, self.d2, self.d3 = self.seqs1

        # RNA sequences
        self.seqs2 = [RNASequence(seq, id=seq_id)
                      for seq_id, seq in self.seqs2_t]
        self.r1, self.r2 = self.seqs2

        # alignments under test
        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.empty = Alignment([])
예제 #17
0
    def test_alphabet(self):
        """alphabet works on an instance and on the class"""
        upper = 'ACBDGHKMNSRTWVY'
        # the expected set holds both cases of every character
        exp = set(upper) | set(upper.lower())

        self.assertEqual(self.b1.alphabet(), exp)
        self.assertEqual(DNASequence.alphabet(), exp)
예제 #18
0
 def test_complement_map(self):
     """complement_map works on an instance and on the class"""
     # bases[i] is expected to complement to complements[i]
     bases = 'ACBDGHKMNSRTWVY'
     complements = 'TGVHCDMKNSYAWBR'

     exp = dict(zip(bases, complements))
     exp.update(zip(bases.lower(), complements.lower()))
     # gap characters map to themselves
     exp.update({'-': '-', '.': '.'})

     self.assertEqual(self.b1.complement_map(), exp)
     self.assertEqual(DNASequence.complement_map(), exp)
예제 #19
0
 def test_complement_map(self):
     """complement_map works on an instance and on the class"""
     upper_pairs = [
         ('A', 'T'), ('C', 'G'), ('B', 'V'), ('D', 'H'), ('G', 'C'),
         ('H', 'D'), ('K', 'M'), ('M', 'K'), ('N', 'N'), ('S', 'S'),
         ('R', 'Y'), ('T', 'A'), ('W', 'W'), ('V', 'B'), ('Y', 'R'),
     ]
     # gap characters map to themselves
     exp = {'-': '-', '.': '.'}
     for base, complement in upper_pairs:
         exp[base] = complement
         exp[base.lower()] = complement.lower()

     self.assertEqual(self.b1.complement_map(), exp)
     self.assertEqual(DNASequence.complement_map(), exp)
예제 #20
0
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    Every column named "ReversePrimer" is always checked; columns named
    "LinkerPrimerSequence" are checked unless disable_primer_check is True.
    An empty field is reported as missing. A field containing any character
    that is neither an IUPAC DNA character nor ',' is reported as invalid,
    once per field however many bad characters it holds.

    All "missing" errors are appended before any "invalid" error. Reported
    locations are "row,column", with the row offset by one to account for
    the header line.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    disable_primer_check:  If True, LinkerPrimerSequence fields are not
     checked.
    """

    # Copy before adding ',' so a set owned by DNASequence is never
    # modified (whether iupac_characters returns a fresh set is not
    # visible from here).
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [ix for ix, field_name in enumerate(header)
                     if field_name in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters; any() stops at the first bad character
    # so that each offending field is reported exactly once
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            field = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in field):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (field, row_ix + correction_ix, curr_ix))

    return errors
예제 #21
0
    def test_majority_consensus(self):
        """majority_consensus returns the most common character per column
        """
        aln = Alignment([
            DNASequence('TTT', id="d1"),
            DNASequence('TT-', id="d2"),
            DNASequence('TC-', id="d3"),
        ])
        self.assertEqual(aln.majority_consensus(), DNASequence('TT-'))

        # with one T and one A, either character is accepted
        tied = Alignment([
            DNASequence('T', id="d1"),
            DNASequence('A', id="d2"),
        ])
        self.assertIn(tied.majority_consensus(),
                      [DNASequence('T'), DNASequence('A')])

        # the empty alignment compares equal to the empty string
        self.assertEqual(self.empty.majority_consensus(), '')
예제 #22
0
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    Every column named "ReversePrimer" is always checked; columns named
    "LinkerPrimerSequence" are checked unless disable_primer_check is True.
    An empty field is reported as missing. A field containing any character
    that is neither an IUPAC DNA character nor ',' is reported as invalid,
    once per field however many bad characters it holds.

    All "missing" errors are appended before any "invalid" error. Reported
    locations are "row,column", with the row offset by one to account for
    the header line.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    disable_primer_check:  If True, LinkerPrimerSequence fields are not
     checked.
    """

    # Copy before adding ',' so a set owned by DNASequence is never
    # modified (whether iupac_characters returns a fresh set is not
    # visible from here).
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [ix for ix, field_name in enumerate(header)
                     if field_name in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters; any() stops at the first bad character
    # so that each offending field is reported exactly once
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            field = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in field):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (field, row_ix + correction_ix, curr_ix))

    return errors
예제 #23
0
    def test_init_validate(self):
        """validate=True accepts valid input and rejects invalid input
        """
        # valid sequences must not raise
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        bad_character = DNASequence('.-ACC-GTXGC--', id="i1")
        with self.assertRaises(SequenceCollectionError):
            Alignment([self.d1, self.d2, self.d3, bad_character],
                      validate=True)

        # invalid lengths (they're not all equal)
        bad_length = DNASequence('.-ACC-GTGC--', id="i2")
        with self.assertRaises(SequenceCollectionError):
            Alignment([self.d1, self.d2, self.d3, bad_length],
                      validate=True)
예제 #24
0
 def test_iupac_degeneracies(self):
     """iupac_degeneracies works on an instance and on the class"""
     upper = [
         ('B', 'CTG'), ('D', 'ATG'), ('H', 'ACT'), ('K', 'TG'),
         ('M', 'AC'), ('N', 'ACTG'), ('S', 'CG'), ('R', 'AG'),
         ('W', 'AT'), ('V', 'ACG'), ('Y', 'CT'),
     ]
     exp = dict((char, set(bases)) for char, bases in upper)
     # each lower-case entry mirrors its upper-case counterpart
     exp.update(
         (char.lower(), set(bases.lower())) for char, bases in upper)

     self.assertEqual(self.b1.iupac_degeneracies(), exp)
     self.assertEqual(DNASequence.iupac_degeneracies(), exp)
예제 #25
0
def check_dna_chars_bcs(header,
                        mapping_data,
                        errors,
                        has_barcodes=True):
    """ Checks for valid DNA characters in barcode field

    When has_barcodes is True, every column named "BarcodeSequence" is
    checked: an empty field is reported as missing, and a field containing
    any character outside DNASequence.iupac_standard_characters() is
    reported as invalid, once per field however many bad characters it
    holds. Barcode uniqueness is not checked here.

    Reported locations are "row,column", with the row offset by one to
    account for the header line.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    has_barcodes:  If True, barcode fields are checked; if False no field
     is checked and errors is returned unchanged.
    """

    valid_dna_chars = DNASequence.iupac_standard_characters()
    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = []
    if has_barcodes:
        header_fields_to_check.append("BarcodeSequence")

    check_indices = [ix for ix, field_name in enumerate(header)
                     if field_name in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            field = row[curr_ix]
            # Check for missing data
            if len(field) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))
                continue
            # any() stops at the first bad character so that each
            # offending field is reported exactly once
            if any(curr_nt not in valid_dna_chars for curr_nt in field):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (field, row_ix + correction_ix, curr_ix))

    return errors
예제 #26
0
 def setUp(self):
     """Create the DNASequence fixtures shared by the tests."""
     self.empty = DNASequence('')
     self.b1 = DNASequence('GATTACA')

     # (sequence, id, description) for the annotated fixtures
     annotated = [
         ('ACCGGTACC', "test-seq-2", "A test sequence"),
         ('ACCGGUACC', "bad-seq-1", "Not a DNA sequence"),
         ('MRWSYKVHDBN', "degen", "All of the degenerate bases"),
     ]
     self.b2, self.b3, self.b4 = [
         DNASequence(sequence, id=seq_id, description=description)
         for sequence, seq_id, description in annotated]

     # sequence with gap characters
     self.b5 = DNASequence('.G--ATTAC-A...')
예제 #27
0
def check_dna_chars_bcs(header, mapping_data, errors, has_barcodes=True):
    """ Checks for valid DNA characters in barcode field

    When has_barcodes is True, every column named "BarcodeSequence" is
    checked: an empty field is reported as missing, and a field containing
    any character outside DNASequence.iupac_standard_characters() is
    reported as invalid, once per field however many bad characters it
    holds. Barcode uniqueness is not checked here.

    Reported locations are "row,column", with the row offset by one to
    account for the header line.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    has_barcodes:  If True, barcode fields are checked; if False no field
     is checked and errors is returned unchanged.
    """

    valid_dna_chars = DNASequence.iupac_standard_characters()
    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = []
    if has_barcodes:
        header_fields_to_check.append("BarcodeSequence")

    check_indices = [ix for ix, field_name in enumerate(header)
                     if field_name in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            field = row[curr_ix]
            # Check for missing data
            if len(field) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))
                continue
            # any() stops at the first bad character so that each
            # offending field is reported exactly once
            if any(curr_nt not in valid_dna_chars for curr_nt in field):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (field, row_ix + correction_ix, curr_ix))

    return errors
예제 #28
0
 def setUp(self):
     """Create the DNASequence fixtures shared by the tests."""
     self.empty = DNASequence('')
     self.b1 = DNASequence('GATTACA')

     # (sequence, identifier, description) for the annotated fixtures
     annotated = [
         ('ACCGGTACC', "test-seq-2", "A test sequence"),
         ('ACCGGUACC', "bad-seq-1", "Not a DNA sequence"),
         ('MRWSYKVHDBN', "degen", "All of the degenerate bases"),
     ]
     self.b2, self.b3, self.b4 = [
         DNASequence(sequence, identifier=seq_id, description=description)
         for sequence, seq_id, description in annotated]

     # sequence with gap characters
     self.b5 = DNASequence('.G--ATTAC-A...')
예제 #29
0
 def test_is_reverse_complement(self):
     """is_reverse_complement is True only for the reverse complement"""
     # b1 compared with itself
     self.assertFalse(self.b1.is_reverse_complement(self.b1))

     other = DNASequence('TGTAATC')
     self.assertTrue(self.b1.is_reverse_complement(other))

     other = DNASequence('NVHDBMRSWYK')
     self.assertTrue(self.b4.is_reverse_complement(other))
예제 #30
0
def run_ampliconnoise(mapping_fp,
                      output_dir,
                      command_handler,
                      params,
                      qiime_config,
                      logger=None,
                      status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228,
                      chimera_beta=0.6200,
                      sff_txt_fp=None,
                      numnodes=2,
                      suppress_perseus=True,
                      output_filepath=None,
                      platform='flx',
                      seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline

    The steps performed by this function are:

    1. Split input sff.txt file into one file per sample
    2. Run scripts required for PyroNoise
    3. Run scripts required for SeqNoise
    4. Run scripts required for Perseus (chimera removal)
    5. Merge output files into one file similar to the output of
       split_libraries.py

    mapping_fp: path to the metadata mapping file; the SampleID,
     BarcodeSequence and LinkerPrimerSequence columns are read, and all
     samples must share a single primer
    output_dir: directory that is created, made the current working
     directory (it is NOT restored on return) and receives map.csv
    command_handler: called with the list of commands built here
    params, qiime_config: passed to WorkflowLogger when logger is None
    logger: logger to use; when None a WorkflowLogger is created and
     command_handler is asked to close it on success
    status_update_callback: passed through to command_handler
    chimera_alpha, chimera_beta: passed to Class.pl; only used when
     suppress_perseus is False
    sff_txt_fp: path to the input sff.txt file; joined onto the directory
     this function was called from
    numnodes: value given to mpirun -np
    suppress_perseus: if True, the Perseus/Class.pl/FilterGoodClass.pl
     commands are not generated
    output_filepath: the merged fasta file; should be absolute
    platform: 'flx' or 'titanium'
    seqnoise_resolution: should be string; when None defaults to '30.0'
     (flx) or '25.0' (titanium)
    truncate_len: should be string; when None defaults to '220' (flx) or
     '400' (titanium)

    Raises RuntimeError for an unsupported platform or when the mapping
    file does not hold exactly one distinct primer.

    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting
    """
    # The parsed values are indexed below, so the file can be closed as
    # soon as parsing returns.
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)

    # Each degenerate character becomes a bracketed character class; the
    # table does not depend on the row, so build it once.
    degenerate_classes = [
        (char, '[' + ''.join(bases) + ']')
        for char, bases in DNASequence.iupac_degeneracies().iteritems()]

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for row in map_data:
        sample_names.append(row[headers.index('SampleID')])
        bc_seqs.append(row[headers.index('BarcodeSequence')])
        primer = row[headers.index('LinkerPrimerSequence')]
        for char, char_class in degenerate_classes:
            primer = primer.replace(char, char_class)
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    # The working directory is now output_dir, so map.csv is opened by
    # bare name: joining output_dir on again would point one level too
    # deep whenever output_dir is a relative path.
    with open('map.csv', 'w') as fh:
        for sample_name, bc_seq in zip(sample_names, bc_seqs):
            fh.write(sample_name + ',' + bc_seq + '\n')

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus:
        fasta_result_names = [
            sample_name + post_pyro_tail + '_seqnoise_cd.fa'
            for sample_name in sample_names
        ]
    else:
        fasta_result_names = [
            sample_name + '_Good.fa' for sample_name in sample_names
        ]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable', cmd)
                     ])

    cmd = 'SplitKeys.pl ' + one_primer + ' map.csv < ' +\
        os.path.join(called_dir, sff_txt_fp) +\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Clean the sample's flows with the platform-specific script
        if platform == 'flx':
            cmd = 'Clean360.pl ' + one_primer + ' ' + sample_name + ' < ' +\
                sample_name + '.raw'
            commands.append([('clean flows ' + sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl ' + one_primer + ' ' + sample_name + ' < ' +\
                sample_name + '.raw'
            commands.append([('clean flows ' + sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = "mpirun -np " + str(numnodes) + " PyroDist -in " +\
            sample_name + ".dat -out " + \
            sample_name + " > " + sample_name + ".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name +\
            " > " + sample_name + ".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = "mpirun -np " + str(numnodes) + " PyroNoise -din " +\
            sample_name + ".dat -out " +\
            sample_name + "_pyronoise " + "-lin " +\
            sample_name + ".list -s 60.0 -c 0.01 > " +\
            sample_name + "_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl ' + bc_seqs[i] + one_primer + ' ' + truncate_len + ' < ' +\
            sample_name + '_pyronoise_cd.fa' + ' > ' + sample_name + '_' +\
            truncate_len + '.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = "mpirun -np " + str(numnodes) + " SeqDist -in " +\
            sample_name + post_pyro_tail +\
            ".fa > " + sample_name + post_pyro_tail + ".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + post_pyro_tail + ".seqdist -out " +\
            sample_name + post_pyro_tail + "fcl > " +\
            sample_name + post_pyro_tail + ".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
        # PC.354_pyronoise_cd.snout

        cmd = "mpirun -np " + str(numnodes) + " SeqNoise -in " +\
            sample_name + post_pyro_tail +\
            ".fa -din " + sample_name + post_pyro_tail + ".seqdist -out " +\
            sample_name + post_pyro_tail +\
            "_seqnoise -lin " + sample_name + post_pyro_tail + 'fcl.list -min ' +\
            sample_name + '_pyronoise' +\
            '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +\
            sample_name + post_pyro_tail + '.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:

            cmd = 'Perseus -sin ' + sample_name + post_pyro_tail +\
                '_seqnoise_cd.fa > ' +\
                sample_name + '.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl ' + sample_name + '.per ' +\
                str(chimera_alpha) + ' ' + str(chimera_beta) +\
                ' > ' + sample_name + '.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl ' + sample_name + post_pyro_tail +\
                '_seqnoise_cd.fa ' +\
                sample_name + '.class 0.5 > ' + sample_name + '_Chi.fa 2> ' +\
                sample_name + '_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = 'unweight_fasta.py -i %s -o %s -l %s' %\
            (fasta_result_names[i], sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
        ' '.join([sample_name + '_unw.fna' for sample_name in sample_names]) +\
        ' > ' + output_filepath  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
예제 #31
0
def run_ampliconnoise(mapping_fp,
                      output_dir, command_handler, params, qiime_config,
                      logger=None, status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228, chimera_beta=0.6200,
                      sff_txt_fp=None, numnodes=2,
                      suppress_perseus=True, output_filepath=None,
                      platform='flx',
                      seqnoise_resolution=None, truncate_len=None):
    """Run the ampliconnoise pipeline.

    The steps performed by this function are:

    1. Split input sff.txt file into one file per sample
    2. Run scripts required for PyroNoise
    3. Run scripts required for SeqNoise
    4. Run scripts required for Perseus (chimera removal), unless
       suppress_perseus is True
    5. Merge output files into one file similar to the output of
       split_libraries.py

    This function only assembles the shell commands; they are passed, in
    order, to command_handler, which is responsible for running them.

    Parameters
    ----------
    mapping_fp : str
        Path to the mapping file. It must contain the SampleID,
        BarcodeSequence and LinkerPrimerSequence columns, and every sample
        must use the same primer.
    output_dir : str
        Working directory. It is created via create_dir and the process
        changes into it; the working directory is NOT restored afterwards.
        May be relative to the directory the function is called from.
    command_handler : callable
        Called as ``command_handler(commands, status_update_callback,
        logger=..., close_logger_on_success=...)``.
    params, qiime_config
        Only used to build a WorkflowLogger when logger is None.
    logger
        Existing logger to use. When None, a WorkflowLogger is created and
        the command handler is asked to close it on success.
    status_update_callback : callable
        Forwarded unchanged to command_handler.
    chimera_alpha, chimera_beta : float
        Passed on the Class.pl command line (only when Perseus runs).
    sff_txt_fp : str
        Path to the sff.txt input, resolved against the directory the
        function was called from. Must be provided in practice: the None
        default cannot be joined into a path.
    numnodes : int
        Value given to ``mpirun -np``.
    suppress_perseus : bool
        When True, skip Perseus/Class.pl/FilterGoodClass.pl and use the
        SeqNoise output as the per-sample result.
    output_filepath : str
        Destination of the merged fasta file; should be absolute. Must be
        a string, since it is concatenated into the final command.
    platform : str
        'flx' or 'titanium'.
    seqnoise_resolution : str
        Value for SeqNoise ``-s``; should be a string. Defaults to '30.0'
        for flx and '25.0' for titanium.
    truncate_len : str
        Truncation length passed to Parse.pl; should be a string. Defaults
        to '220' for flx and '400' for titanium.

    Raises
    ------
    RuntimeError
        If a default is needed for an unknown platform, if the mapping file
        does not yield exactly one distinct primer, or if platform is not
        supported.

    Notes
    -----
    Environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as
    they may not inherit the correct environment variable setting.
    """
    # Close the mapping file as soon as it has been parsed instead of
    # leaving the handle to the garbage collector. The 'U' mode is kept
    # exactly as the file was opened before.
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, _ = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)

    # Look the column positions up once rather than once per row.
    sample_col = headers.index('SampleID')
    barcode_col = headers.index('BarcodeSequence')
    primer_col = headers.index('LinkerPrimerSequence')
    degeneracies = DNASequence.iupac_degeneracies()

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for row in map_data:
        sample_names.append(row[sample_col])
        bc_seqs.append(row[barcode_col])
        primer = row[primer_col]
        # Expand each degenerate character into a bracketed character
        # class. The bases come from a set, so they are sorted to make the
        # generated (and logged) command text reproducible. items() works
        # on both Python 2 and Python 3.
        for char, bases in degeneracies.items():
            primer = primer.replace(char, '[' + ''.join(sorted(bases)) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    # Resolve output_dir against the original working directory BEFORE
    # changing into it. os.path.join returns an absolute output_dir
    # unchanged, while a relative one no longer ends up pointing at
    # <output_dir>/<output_dir> once the chdir has happened.
    abs_output_dir = os.path.join(called_dir, output_dir)
    os.chdir(output_dir)
    with open(os.path.join(abs_output_dir, 'map.csv'), 'w') as map_csv:
        for name, barcode in zip(sample_names, bc_seqs):
            map_csv.write(name + ',' + barcode + '\n')

    # Files produced after Parse.pl carry the truncation length in their
    # name, e.g. PC.636_220.fa
    post_pyro_tail = '_' + truncate_len
    mpirun = "mpirun -np " + str(numnodes)

    cmd = 'cd ' + abs_output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = 'SplitKeys.pl ' + one_primer + ' map.csv < ' +\
        os.path.join(called_dir, sff_txt_fp) +\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for sample_name, barcode in zip(sample_names, bc_seqs):
        # sample name plus truncation tail, the stem of all SeqNoise files
        trunc = sample_name + post_pyro_tail

        # Clean the per-sample flowgrams with the platform-specific script.
        # FlowsFA.pl / FlowsMinMax.pl are deliberately not used: they
        # apparently run through the whole sff file once per sample.
        if platform == 'flx':
            clean_script = 'Clean360.pl '
        elif platform == 'titanium':
            clean_script = 'CleanMinMax.pl '
        else:
            raise RuntimeError("platform " + platform + " not supported")
        cmd = clean_script + one_primer + ' ' + sample_name + ' < ' +\
            sample_name + '.raw'
        commands.append([('clean flows ' + sample_name, cmd)])

        cmd = mpirun + " PyroDist -in " + sample_name + ".dat -out " +\
            sample_name + " > " + sample_name + ".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name +\
            " > " + sample_name + ".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = mpirun + " PyroNoise -din " + sample_name + ".dat -out " +\
            sample_name + "_pyronoise -lin " +\
            sample_name + ".list -s 60.0 -c 0.01 > " +\
            sample_name + "_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl ' + barcode + one_primer + ' ' + truncate_len +\
            ' < ' + sample_name + '_pyronoise_cd.fa > ' + trunc + '.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # from here on the files are named with post_pyro_tail
        cmd = mpirun + " SeqDist -in " + trunc + ".fa > " + trunc + ".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in " + trunc + ".seqdist -out " + trunc + "fcl > " +\
            trunc + ".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g. (flx defaults):
        # mpirun -np 2 SeqNoise -in PC.354_220.fa -din PC.354_220.seqdist
        # -out PC.354_220_seqnoise -lin PC.354_220fcl.list
        # -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 > PC.354_220.snout
        cmd = mpirun + " SeqNoise -in " + trunc + ".fa -din " +\
            trunc + ".seqdist -out " + trunc + "_seqnoise -lin " +\
            trunc + 'fcl.list -min ' +\
            sample_name + '_pyronoise.mapping -s ' +\
            seqnoise_resolution + ' -c 0.08 > ' + trunc + '.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if suppress_perseus:
            # without chimera removal the SeqNoise output is the result
            result_fasta = trunc + '_seqnoise_cd.fa'
        else:
            cmd = 'Perseus -sin ' + trunc + '_seqnoise_cd.fa > ' +\
                sample_name + '.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl ' + sample_name + '.per ' +\
                str(chimera_alpha) + ' ' + str(chimera_beta) +\
                ' > ' + sample_name + '.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl ' + trunc + '_seqnoise_cd.fa ' +\
                sample_name + '.class 0.5 > ' + sample_name + '_Chi.fa 2> ' +\
                sample_name + '_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

            # e.g. PC.636_Good.fa
            result_fasta = sample_name + '_Good.fa'

        cmd = 'unweight_fasta.py -i %s -o %s -l %s' %\
            (result_fasta, sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    # merge the per-sample results into the single output file
    cmd = 'cat ' +\
        ' '.join([name + '_unw.fna' for name in sample_names]) +\
        ' > ' + output_filepath  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
예제 #32
0
 def test_iupac_standard_characters(self):
     """iupac_standard_characters yields upper- and lowercase A, C, G, T"""
     expected = set("ACGTacgt")
     # The result must be the same from an instance and from the class.
     for subject in (self.b1, DNASequence):
         self.assertEqual(subject.iupac_standard_characters(), expected)
예제 #33
0
class DNASequenceTests(TestCase):
    """Tests of DNASequence alphabets, complements and nondegenerates()."""

    def setUp(self):
        """Create the sequences shared by the tests below."""
        self.empty = DNASequence('')
        self.b1 = DNASequence('GATTACA')
        self.b2 = DNASequence(
            'ACCGGTACC', id="test-seq-2", description="A test sequence")
        # Contains a U, which the tests expect to be reported as unsupported.
        self.b3 = DNASequence(
            'ACCGGUACC', id="bad-seq-1", description="Not a DNA sequence")
        self.b4 = DNASequence(
            'MRWSYKVHDBN', id="degen",
            description="All of the degenerate bases")
        # Gapped sequence using both gap characters ('-' and '.').
        self.b5 = DNASequence('.G--ATTAC-A...')

    def test_alphabet(self):
        """alphabet() is identical on an instance and on the class."""
        upper = 'ACBDGHKMNSRTWVY'
        expected = set(upper + upper.lower())
        for subject in (self.b1, DNASequence):
            self.assertEqual(subject.alphabet(), expected)

    def test_gap_alphabet(self):
        """gap_alphabet() holds exactly '-' and '.'."""
        self.assertEqual(self.b1.gap_alphabet(), {'-', '.'})

    def test_complement_map(self):
        """complement_map() pairs every character, in both cases."""
        upper = {
            'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
            'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
            'K': 'M', 'M': 'K', 'R': 'Y', 'Y': 'R',
            'N': 'N', 'S': 'S', 'W': 'W',
        }
        # Gap characters map to themselves; the lowercase half mirrors the
        # uppercase one.
        expected = {'-': '-', '.': '.'}
        for base, partner in upper.items():
            expected[base] = partner
            expected[base.lower()] = partner.lower()
        for subject in (self.b1, DNASequence):
            self.assertEqual(subject.complement_map(), expected)

    def test_iupac_standard_characters(self):
        """iupac_standard_characters() is A, C, G, T in both cases."""
        expected = set("ACGTacgt")
        for subject in (self.b1, DNASequence):
            self.assertEqual(subject.iupac_standard_characters(), expected)

    def test_iupac_degeneracies(self):
        """iupac_degeneracies() maps each degenerate char to its bases."""
        upper = {
            'B': 'CTG', 'D': 'ATG', 'H': 'ACT', 'K': 'TG', 'M': 'AC',
            'N': 'ACTG', 'S': 'CG', 'R': 'AG', 'W': 'AT', 'V': 'ACG',
            'Y': 'CT',
        }
        expected = {}
        for char, bases in upper.items():
            expected[char] = set(bases)
            expected[char.lower()] = set(bases.lower())
        for subject in (self.b1, DNASequence):
            self.assertEqual(subject.iupac_degeneracies(), expected)

    def test_iupac_degenerate_characters(self):
        """iupac_degenerate_characters() lists the degenerate characters."""
        upper = 'BDHKMNSRWVY'
        expected = set(upper + upper.lower())
        for subject in (self.b1, DNASequence):
            self.assertEqual(subject.iupac_degenerate_characters(), expected)

    def test_iupac_characters(self):
        """iupac_characters() is identical on an instance and the class."""
        upper = 'ACBDGHKMNSRTWVY'
        expected = set(upper + upper.lower())
        for subject in (self.b1, DNASequence):
            self.assertEqual(subject.iupac_characters(), expected)

    def test_complement(self):
        """complement() complements in place order and rejects b3."""
        for seq, text in ((self.b1, "CTAATGT"), (self.b2, "TGGCCATGG")):
            self.assertEqual(seq.complement(), DNASequence(text))
        with self.assertRaises(BiologicalSequenceError):
            self.b3.complement()
        for seq, text in ((self.b4, "KYWSRMBDHVN"),
                          (self.b5, ".C--TAATG-T...")):
            self.assertEqual(seq.complement(), DNASequence(text))

    def test_reverse_complement(self):
        """reverse_complement() reverses the complement and rejects b3."""
        for seq, text in ((self.b1, "TGTAATC"), (self.b2, "GGTACCGGT")):
            self.assertEqual(seq.reverse_complement(), DNASequence(text))
        with self.assertRaises(BiologicalSequenceError):
            self.b3.reverse_complement()
        self.assertEqual(self.b4.reverse_complement(),
                         DNASequence("NVHDBMRSWYK"))

    def test_unsupported_characters(self):
        """Only b3 reports an unsupported character (U)."""
        cases = (
            (self.b1, set()),
            (self.b2, set()),
            (self.b3, {'U'}),
            (self.b4, set()),
        )
        for seq, expected in cases:
            self.assertEqual(seq.unsupported_characters(), expected)

    def test_has_unsupported_characters(self):
        """has_unsupported_characters() is true for b3 only."""
        for seq in (self.b1, self.b2):
            self.assertFalse(seq.has_unsupported_characters())
        self.assertTrue(self.b3.has_unsupported_characters())
        self.assertFalse(self.b4.has_unsupported_characters())

    def test_is_reverse_complement(self):
        """is_reverse_complement() recognises the reverse complement."""
        self.assertFalse(self.b1.is_reverse_complement(self.b1))
        for seq, text in ((self.b1, 'TGTAATC'), (self.b4, 'NVHDBMRSWYK')):
            self.assertTrue(seq.is_reverse_complement(DNASequence(text)))

    def test_nondegenerates_invalid(self):
        """Expanding 'AZA' raises BiologicalSequenceError."""
        def expand():
            return list(DNASequence('AZA').nondegenerates())

        self.assertRaises(BiologicalSequenceError, expand)

    def test_nondegenerates_empty(self):
        """An empty sequence expands to just itself."""
        observed = list(self.empty.nondegenerates())
        self.assertEqual(observed, [self.empty])

    def test_nondegenerates_no_degens(self):
        """A sequence without degenerate characters expands to itself."""
        observed = list(self.b1.nondegenerates())
        self.assertEqual(observed, [self.b1])

    def test_nondegenerates_all_degens(self):
        """Sequences made only of degenerate characters expand fully."""
        # Same chars.
        expected = [DNASequence(text) for text in ('CC', 'CG', 'GC', 'GG')]
        # Sort based on sequence string, as order is not guaranteed.
        observed = sorted(DNASequence('SS').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

        # Different chars.
        expected = [DNASequence(text) for text in ('AC', 'AG', 'GC', 'GG')]
        observed = sorted(DNASequence('RS').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

        # Odd number of chars.
        observed = list(DNASequence('NNN').nondegenerates())
        self.assertEqual(len(observed), 4 ** 3)

    def test_nondegenerates_mixed_degens(self):
        """Only the degenerate positions of 'RGY' are expanded."""
        expected = [DNASequence(text)
                    for text in ('AGC', 'AGT', 'GGC', 'GGT')]
        observed = sorted(DNASequence('RGY').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

    def test_nondegenerates_gap_mixed_case(self):
        """Gaps and character case are kept when expanding '-M.m'."""
        expected = [DNASequence(text)
                    for text in ('-A.a', '-A.c', '-C.a', '-C.c')]
        observed = sorted(DNASequence('-M.m').nondegenerates(), key=str)
        self.assertEqual(observed, expected)
예제 #34
0
class DNASequenceTests(TestCase):
    """Checks DNASequence alphabets, complementing and degenerate expansion."""

    def setUp(self):
        """Build the fixture sequences used throughout this test case."""
        self.empty = DNASequence('')
        self.b1 = DNASequence('GATTACA')
        self.b2 = DNASequence('ACCGGTACC',
                              id="test-seq-2",
                              description="A test sequence")
        # b3 carries a U; the tests expect it to be flagged as unsupported.
        self.b3 = DNASequence('ACCGGUACC',
                              id="bad-seq-1",
                              description="Not a DNA sequence")
        self.b4 = DNASequence('MRWSYKVHDBN',
                              id="degen",
                              description="All of the degenerate bases")
        # b5 mixes the two gap characters with regular bases.
        self.b5 = DNASequence('.G--ATTAC-A...')

    def test_alphabet(self):
        """Instance and class agree on the alphabet."""
        expected = set('ACBDGHKMNSRTWVY') | set('acbdghkmnsrtwvy')
        observed_instance = self.b1.alphabet()
        self.assertEqual(observed_instance, expected)
        observed_class = DNASequence.alphabet()
        self.assertEqual(observed_class, expected)

    def test_gap_alphabet(self):
        """The gap alphabet is '-' plus '.'."""
        expected = set('-.')
        observed = self.b1.gap_alphabet()
        self.assertEqual(observed, expected)

    def test_complement_map(self):
        """Instance and class agree on the complement map."""
        # Position i of `partners` is the complement of position i of
        # `chars`; the lowercase mapping mirrors the uppercase one.
        chars = 'ACGTBDHKMNSRWVY-.'
        partners = 'TGCAVHDMKNSYWBR-.'
        expected = dict(zip(chars + chars.lower(),
                            partners + partners.lower()))
        observed_instance = self.b1.complement_map()
        self.assertEqual(observed_instance, expected)
        observed_class = DNASequence.complement_map()
        self.assertEqual(observed_class, expected)

    def test_iupac_standard_characters(self):
        """Instance and class agree on the standard characters."""
        expected = set("ACGT") | set("acgt")
        observed_instance = self.b1.iupac_standard_characters()
        self.assertEqual(observed_instance, expected)
        observed_class = DNASequence.iupac_standard_characters()
        self.assertEqual(observed_class, expected)

    def test_iupac_degeneracies(self):
        """Instance and class agree on the degeneracy table."""
        pairs = [
            ('B', 'CTG'), ('D', 'ATG'), ('H', 'ACT'), ('K', 'TG'),
            ('M', 'AC'), ('N', 'ACTG'), ('S', 'CG'), ('R', 'AG'),
            ('W', 'AT'), ('V', 'ACG'), ('Y', 'CT'),
        ]
        expected = dict((char, set(bases)) for char, bases in pairs)
        # The lowercase entries mirror the uppercase ones.
        expected.update(
            (char.lower(), set(bases.lower())) for char, bases in pairs)
        observed_instance = self.b1.iupac_degeneracies()
        self.assertEqual(observed_instance, expected)
        observed_class = DNASequence.iupac_degeneracies()
        self.assertEqual(observed_class, expected)

    def test_iupac_degenerate_characters(self):
        """Instance and class agree on the degenerate characters."""
        expected = set('BDHKMNSRWVY') | set('bdhkmnsrwvy')
        observed_instance = self.b1.iupac_degenerate_characters()
        self.assertEqual(observed_instance, expected)
        observed_class = DNASequence.iupac_degenerate_characters()
        self.assertEqual(observed_class, expected)

    def test_iupac_characters(self):
        """Instance and class agree on the full IUPAC character set."""
        expected = set('ACBDGHKMNSRTWVY') | set('acbdghkmnsrtwvy')
        observed_instance = self.b1.iupac_characters()
        self.assertEqual(observed_instance, expected)
        observed_class = DNASequence.iupac_characters()
        self.assertEqual(observed_class, expected)

    def test_complement(self):
        """complement() gives the expected sequences; b3 raises."""
        expected_b1 = DNASequence("CTAATGT")
        self.assertEqual(self.b1.complement(), expected_b1)
        expected_b2 = DNASequence("TGGCCATGG")
        self.assertEqual(self.b2.complement(), expected_b2)
        with self.assertRaises(BiologicalSequenceError):
            self.b3.complement()
        expected_b4 = DNASequence("KYWSRMBDHVN")
        self.assertEqual(self.b4.complement(), expected_b4)
        expected_b5 = DNASequence(".C--TAATG-T...")
        self.assertEqual(self.b5.complement(), expected_b5)

    def test_reverse_complement(self):
        """reverse_complement() gives the expected sequences; b3 raises."""
        expected_b1 = DNASequence("TGTAATC")
        self.assertEqual(self.b1.reverse_complement(), expected_b1)
        expected_b2 = DNASequence("GGTACCGGT")
        self.assertEqual(self.b2.reverse_complement(), expected_b2)
        with self.assertRaises(BiologicalSequenceError):
            self.b3.reverse_complement()
        expected_b4 = DNASequence("NVHDBMRSWYK")
        self.assertEqual(self.b4.reverse_complement(), expected_b4)

    def test_unsupported_characters(self):
        """unsupported_characters() is empty except for b3's U."""
        nothing = set()
        self.assertEqual(self.b1.unsupported_characters(), nothing)
        self.assertEqual(self.b2.unsupported_characters(), nothing)
        self.assertEqual(self.b3.unsupported_characters(), {'U'})
        self.assertEqual(self.b4.unsupported_characters(), nothing)

    def test_has_unsupported_characters(self):
        """has_unsupported_characters() flags b3 and nothing else."""
        flagged_b1 = self.b1.has_unsupported_characters()
        self.assertFalse(flagged_b1)
        flagged_b2 = self.b2.has_unsupported_characters()
        self.assertFalse(flagged_b2)
        flagged_b3 = self.b3.has_unsupported_characters()
        self.assertTrue(flagged_b3)
        flagged_b4 = self.b4.has_unsupported_characters()
        self.assertFalse(flagged_b4)

    def test_is_reverse_complement(self):
        """A sequence is not its own reverse complement here; the true
        reverse complements of b1 and b4 are recognised."""
        self.assertFalse(self.b1.is_reverse_complement(self.b1))
        revcomp_b1 = DNASequence('TGTAATC')
        self.assertTrue(self.b1.is_reverse_complement(revcomp_b1))
        revcomp_b4 = DNASequence('NVHDBMRSWYK')
        self.assertTrue(self.b4.is_reverse_complement(revcomp_b4))

    def test_nondegenerates_invalid(self):
        """nondegenerates() on 'AZA' raises BiologicalSequenceError."""
        self.assertRaises(
            BiologicalSequenceError,
            lambda: list(DNASequence('AZA').nondegenerates()))

    def test_nondegenerates_empty(self):
        """The empty sequence yields itself as the only expansion."""
        expansions = list(self.empty.nondegenerates())
        self.assertEqual(expansions, [self.empty])

    def test_nondegenerates_no_degens(self):
        """A fully determined sequence yields itself only."""
        expansions = list(self.b1.nondegenerates())
        self.assertEqual(expansions, [self.b1])

    def test_nondegenerates_all_degens(self):
        """Fully degenerate sequences expand to every combination."""
        # Expansions are sorted by sequence string, as order is not
        # guaranteed.
        cases = (
            ('SS', ('CC', 'CG', 'GC', 'GG')),  # same chars
            ('RS', ('AC', 'AG', 'GC', 'GG')),  # different chars
        )
        for degenerate, texts in cases:
            expected = [DNASequence(text) for text in texts]
            observed = sorted(DNASequence(degenerate).nondegenerates(),
                              key=str)
            self.assertEqual(observed, expected)

        # Odd number of chars.
        expansions = list(DNASequence('NNN').nondegenerates())
        self.assertEqual(len(expansions), 4 ** 3)

    def test_nondegenerates_mixed_degens(self):
        """'RGY' expands at its two degenerate positions only."""
        expected = []
        for text in ('AGC', 'AGT', 'GGC', 'GGT'):
            expected.append(DNASequence(text))
        observed = sorted(DNASequence('RGY').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

    def test_nondegenerates_gap_mixed_case(self):
        """'-M.m' keeps its gaps and the case of each expanded position."""
        expected = []
        for text in ('-A.a', '-A.c', '-C.a', '-C.c'):
            expected.append(DNASequence(text))
        observed = sorted(DNASequence('-M.m').nondegenerates(), key=str)
        self.assertEqual(observed, expected)
예제 #35
0
 def setUp(self):
     """Create the DNASequence fixtures (b1-b5 plus an empty sequence)."""
     self.empty = DNASequence('')
     self.b1 = DNASequence('GATTACA')
     self.b2 = DNASequence(
         'ACCGGTACC', id="test-seq-2", description="A test sequence")
     # Contains a U, hence the "Not a DNA sequence" description.
     self.b3 = DNASequence(
         'ACCGGUACC', id="bad-seq-1", description="Not a DNA sequence")
     self.b4 = DNASequence(
         'MRWSYKVHDBN', id="degen",
         description="All of the degenerate bases")
     # Sequence containing '.' and '-' characters alongside bases.
     self.b5 = DNASequence('.G--ATTAC-A...')
예제 #36
0
 def test_iupac_standard_characters(self):
     """Instance and class both report A, C, G, T in upper and lower case."""
     expected = {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'}
     from_instance = self.b1.iupac_standard_characters()
     self.assertEqual(from_instance, expected)
     from_class = DNASequence.iupac_standard_characters()
     self.assertEqual(from_class, expected)
예제 #37
0
 def test_complement(self):
     """complement() returns the expected sequences and raises for b3."""
     for seq, text in ((self.b1, "CTAATGT"), (self.b2, "TGGCCATGG")):
         self.assertEqual(seq.complement(), DNASequence(text))
     with self.assertRaises(BiologicalSequenceError):
         self.b3.complement()
     for seq, text in ((self.b4, "KYWSRMBDHVN"),
                       (self.b5, ".C--TAATG-T...")):
         self.assertEqual(seq.complement(), DNASequence(text))
예제 #38
0
 def test_nondegenerates_invalid(self):
     """Expanding 'AZA' must raise BiologicalSequenceError."""
     def expand():
         # Both the call and the consumption of its result happen inside
         # the checked callable, so the error is caught wherever it arises.
         return list(DNASequence('AZA').nondegenerates())

     self.assertRaises(BiologicalSequenceError, expand)
예제 #39
0
 def test_iupac_standard_characters(self):
     """The standard characters are A, C, G, T in both cases."""
     expected = set("ACGT") | set("acgt")
     # Check the instance first, then the class, against the same set.
     for subject in (self.b1, DNASequence):
         self.assertEqual(subject.iupac_standard_characters(), expected)
예제 #40
0
 def test_iupac_degenerate_characters(self):
     """Instance and class report the same degenerate characters."""
     upper = 'BDHKMNSRWVY'
     # The lowercase characters mirror the uppercase ones.
     expected = set(upper + upper.lower())
     for subject in (self.b1, DNASequence):
         self.assertEqual(subject.iupac_degenerate_characters(), expected)
예제 #41
0
    def test_nondegenerates_all_degens(self):
        """Sequences consisting only of degenerate characters expand to
        every combination of their bases."""
        # Same chars.
        expected = [DNASequence(text) for text in ('CC', 'CG', 'GC', 'GG')]
        # Sort based on sequence string, as order is not guaranteed.
        observed = sorted(DNASequence('SS').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

        # Different chars.
        expected = [DNASequence(text) for text in ('AC', 'AG', 'GC', 'GG')]
        observed = sorted(DNASequence('RS').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

        # Odd number of chars.
        expansions = list(DNASequence('NNN').nondegenerates())
        self.assertEqual(len(expansions), 4 ** 3)