Example #1
def test_merge_targets():
    """Identify cluster for merging"""
    uid1 = "ACCT"
    uid2 = "GGGG"
    uid3 = "AAGG"

    seq1 = ["ACTGTTTGTCTAAGC"] * 2
    qual1 = ['I' * len(seq1[0])] * len(seq1)
    seq2 = ["ACTGTTTTTCTAAGC"] * 5
    qual2 = ['I' * len(seq2[0])] * len(seq2)
    seq3 = ["ACTGTTTTTCTAAGC"] * 2
    qual3 = ['I' * len(seq3[0])] * len(seq3)

    clusters = create_consensus([uid1 + uid1]*len(seq1) + \
                                   [uid2 + uid2]*len(seq2),
                                ['I'*(len(uid1)*2)]*(len(seq1) + len(seq2)),
                                seq1 + seq2, qual1 + qual2)
    seq3 = [
        pseq.SequenceWithQuality(seq, qual) for seq, qual in zip(seq3, qual3)
    ]
    uid = pseq.SequenceWithQuality(uid2 + uid3, 'I' * (len(uid2) + len(uid3)))
    cand = clusters.merge_target(uid, seq3[0], {}, 2)
    assert cand == uid2 + uid2, "%r != %r" % (cand, uid2 + uid2)
    cand = clusters.merge_target(uid, seq3[0], {}, 1)
    assert cand is None, "%r != %r" % (cand, None)
Example #2
def test_consensus_diff():
    """Update sequence diff"""
    suffix = 'A'*45
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    seq1 = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC"+suffix, "IIIDIIIIIIIIIII"*4)
    seq2 = sequence.SequenceWithQuality("ACTTTTTGTCTTAGC"+suffix, "IIIIIIIIIDIDIII"*4)
    seq3 = sequence.SequenceWithQuality("ACTTTTTGTGTTAGC"+suffix, "IIIIIIIIIqIDIII"*4)
    consensus = cons.Consensus(id1, seq2)
    success = consensus.update(id1, seq1)

    assert success, "Sequence %r was rejected" % seq1
    success = consensus.update(id1, seq3)

    seq_expect = "ACTTTTTGTGTAAGC"+suffix
    qual_expect = "IIIIIIIIIqIIIII"*4
    diff_expect = {3:{'T':2, 'G':1},
                   11:{'A':1, 'T':2},
                   9:{'C':2, 'G':1}}
    assert success, "Sequence %r was rejected" % seq3
    assert consensus.sequence.sequence == seq_expect, \
           "Failed to update consensus (%s != %s)" % (consensus.sequence.sequence, seq_expect)
    assert consensus.sequence.quality == qual_expect, \
           "Failed to update qualities (%s != %s)" % (consensus.sequence.quality, qual_expect)
    assert consensus.diffs == diff_expect, \
           "Incorrect sequence diff (%r != %r)" % (consensus.diffs, diff_expect)
Example #3
def create_consensus(uids, uid_qual, seqs, seq_qual):
    """Create consensus dictionary from raw sequences.

    Args:
        uids (:obj:`list`): UID sequences.
        seqs (:obj:`list`): Read sequences.

    Returns:
        :obj:`dict`: Consensus sequences.
    """
    uid_with_qual = [
        sequence.SequenceWithQuality(seq, qual)
        for seq, qual in zip(uids, uid_qual)
    ]
    seq_with_qual = [
        sequence.SequenceWithQuality(seq, qual)
        for seq, qual in zip(seqs, seq_qual)
    ]
    cluster = {}
    for uid, seq in zip(uid_with_qual, seq_with_qual):
        if uid.sequence not in cluster:
            cluster[uid.sequence] = cons.Consensus(uid, seq)
        else:
            cluster[uid.sequence].update(uid, seq)
    return clust.Clustering(cluster)
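
A short usage sketch for create_consensus, assuming the returned Clustering supports membership tests and item access by UID string as in Example #13, and that identical reads simply increment the cluster size; the concrete UID and read strings below are illustrative, not taken from the source.

# Illustrative call (a sketch; values are made up, behaviour is inferred
# from the other examples in this listing).
uids = ["ACGTACGT"] * 3 + ["TTGGTTGG"] * 2
uid_qual = ["I" * 8] * 5
reads = ["ACTGTTTGTCTAAGC"] * 3 + ["ACTGTTTTTCTAAGC"] * 2
read_qual = ["I" * 15] * 5

clusters = create_consensus(uids, uid_qual, reads, read_qual)
assert "ACGTACGT" in clusters          # Clustering supports `in` by UID
assert clusters["ACGTACGT"].size == 3  # one Consensus per distinct UID
print(clusters["TTGGTTGG"].sequence.sequence)  # consensus read sequence
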
Example #4
def test_consensus_new():
    """Create objects of class Consensus"""
    seq = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    id_seq = sequence.SequenceWithQuality("AAA", "III")
    consensus = cons.Consensus(id_seq, seq)
    assert consensus.sequence == seq
    assert consensus.uid == id_seq
    assert consensus.size == 1
Example #5
def test_update_uid(qual1, qual2, expect):
    """Retain highest quality"""
    id1 = sequence.SequenceWithQuality("A"*len(qual1), qual1)
    id2 = sequence.SequenceWithQuality("A"*len(qual2), qual2)
    seq = sequence.SequenceWithQuality("A"*20, "I"*20)
    consensus = cons.Consensus(id1, seq)
    consensus._update_uid(id2)
    assert consensus.uid.quality == expect, \
           "Failed to retain high quality sequence (%r != %r)" % (consensus.uid.quality, expect)
Example #6
def test_merge_fail_uid():
    """Don't merge sequences with very different UIDs'"""
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    id2 = sequence.SequenceWithQuality("CCAA", "IIII")
    seq = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    cons1 = cons.Consensus(id1, seq)
    cons2 = cons.Consensus(id2, seq)
    merged = cons1.merge(cons2, 1)
    assert not merged, "Merging succeeded unexpectedly"
Example #7
def test_consensus_skip():
    """Reject sequences that are too different"""
    uid = sequence.SequenceWithQuality("AAA", "III")
    seq1 = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    seq2 = sequence.SequenceWithQuality("TTCTCCCTGGTAAGC", "IIIDIIIIIIIIIII")
    consensus = cons.Consensus(uid, seq1)
    success = consensus.update(uid, seq2)
    assert not success
    assert consensus.sequence == seq1, "%r != %r" % (consensus.sequence, seq1)
    assert consensus.different == 1, "Skipped sequence not counted"
Example #8
def test_consensus_idlen():
    """Skip sequences with incompatible IDs"""
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    id2 = sequence.SequenceWithQuality("AAAAA", "IIIII")
    seq = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")

    consensus = cons.Consensus(id1, seq)
    success = consensus.update(id2, seq)
    assert not success
    assert consensus.uid == id1, "%r != %r" % (consensus.uid, id1)
Example #9
def test_merge_size():
    """Update size of merged clusters"""
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    id2 = sequence.SequenceWithQuality("AACA", "IIII")
    seq = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    cons1 = cons.Consensus(id1, seq)
    cons1.update(id1, seq)
    cons2 = cons.Consensus(id2, seq)
    cons2.update(id2, seq)
    merged = cons1.merge(cons2, 1)
    assert merged, "Merging failed unexpectedly"
    assert cons1.size == 4, "Incorrect size for merged cluster (%d != %d)" % (cons1.size, 4)
Example #10
def test_merge_simple():
    """Combine two consensus sequences"""
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    id2 = sequence.SequenceWithQuality("AACA", "IIII")
    seq = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    cons1 = cons.Consensus(id1, seq)
    cons2 = cons.Consensus(id2, seq)
    merged = cons1.merge(cons2, 1)
    assert merged, "Merging failed unexpectedly"
    assert cons1.size == 2, "Incorrect size for merged cluster (%d != %d)" % (cons1.size, 2)
    assert cons1.sequence.sequence == seq.sequence, "Incorrect merged sequence (%r != %r)" % \
                                           (cons1.sequence.sequence, seq.sequence)
Example #11
def test_consensus_str():
    """String representation of consensus sequences"""
    suffix = 'A'*45
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    seq1 = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC"+suffix, "IIIDIIIIIIIIIII"*4, name='test')
    seq2 = sequence.SequenceWithQuality("ACTTTTTGTCTTAGC"+suffix, "IIIIIIIIIDIDIII"*4, name='test')
    consensus = cons.Consensus(id1, seq1)
    expect_str1 = "@test:AAAA:IIII:1:0:0:0\nACTGTTTGTCTAAGC"+suffix+"\n+\n"+"IIIDIIIIIIIIIII"*4
    expect_repr1 = "Consensus(uid=SequenceWithQuality(sequence='AAAA', " + \
                                                     "quality='IIII', name=''), " + \
                   "sequence=SequenceWithQuality(sequence='ACTGTTTGTCTAAGC" + suffix +"', " + \
                                                "quality='" + 'IIIDIIIIIIIIIII'*4 + \
                                                "', name='test'), " + \
                   "diffs={}, size=1)"
    expect_str2 = "@test:AAAA:IIII:2:0:0:0\nACTTTTTGTCTAAGC" + suffix + \
                  "\n+4G1T1 12A1T1\n" + "IIIIIIIIIIIIIII"*4

    assert str(consensus) == expect_str1, "\n%s\n!=\n%s" % (consensus, expect_str1)
    assert repr(consensus) == expect_repr1, "\n%r\n!=\n%r" % (consensus, expect_repr1)
    consensus.update(id1, seq2)
    assert str(consensus) == expect_str2, "\n%s\n!=\n%s" % (str(consensus), expect_str2)
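
Judging from expect_str2, the '+' separator line serializes the accumulated diffs as space-separated groups of a 1-based position followed by base/count pairs: "+4G1T1 12A1T1" corresponds to the 0-based entries {3: {'G': 1, 'T': 1}, 11: {'A': 1, 'T': 1}} after the single update above. The helper below is an illustrative parser for that inferred format, not part of the library's API.

import re

def parse_diff_line(line):
    """Turn a '+<1-based pos><base><count>...' separator line back into a
    dict of 0-based position -> {base: count}. The format is inferred from
    expect_str2 above; this helper is illustrative only."""
    diffs = {}
    for group in line.lstrip('+').split():
        pos_part, bases_part = re.match(r'(\d+)((?:[ACGTN]\d+)+)', group).groups()
        diffs[int(pos_part) - 1] = {
            base: int(count)
            for base, count in re.findall(r'([ACGTN])(\d+)', bases_part)
        }
    return diffs

print(parse_diff_line("+4G1T1 12A1T1"))
# {3: {'G': 1, 'T': 1}, 11: {'A': 1, 'T': 1}}
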
Example #12
def test_consensus_seqlen():
    """Skip shorter sequences"""
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    seq1 = sequence.SequenceWithQuality("AACTGTGAGTGTAGATGTTCTGTA", "I"*24)
    seq2 = sequence.SequenceWithQuality("AACTGTGAGTGTAGATGTTC", "I"*20)
    consensus = cons.Consensus(id1, seq1)
    success = consensus.update(id1, seq2)
    assert not success
    assert consensus.sequence == seq1, "%r != %r" % (consensus.sequence, seq1)
    assert consensus.shorter == 1, "Skipped sequence not recorded"

    consensus = cons.Consensus(id1, seq2)
    success = consensus.update(id1, seq1)
    assert not success
    assert consensus.sequence == seq1, "%r != %r" % (consensus.sequence, seq1)
    assert consensus.shorter == 1, "Skipped sequence not recorded"

    consensus = cons.Consensus(id1, seq2)
    success = consensus.update(id1, seq2)
    assert success
    success = consensus.update(id1, seq1)
    assert not success
    assert consensus.sequence == seq2, "%r != %r" % (consensus.sequence, seq2)
    assert consensus.longer == 1, "Skipped sequence not recorded"
Example #13
    @classmethod
    def from_fastq(cls,
                   input_file,
                   id_length,
                   adapter,
                   threshold=5,
                   prefix=5,
                   read_length=None):
        """Read FASTQ file to generate consensus sequences.

        Args:
            input_file (:obj:`str`): Name of input file.
            id_length (:obj:`int`): Length of UID sequence at beginning/end of read.
            adapter (:obj:`str`): Adapter sequence.
            threshold (:obj:`int`, optional): Maximum number of differences allowed between UIDs.
            prefix (:obj:`int`, optional): Length of UID prefix to use in clustering algorithm.
            read_length (:obj:`int`, optional): Original read length used. If this is set
                and the logging level is sufficiently high, additional log entries are
                generated to track the number of short and long fragments processed.
        Returns:
            :obj:`dict`: Computed consensus sequences.
        """
        adapt_length = id_length + len(adapter)
        if read_length is not None:
            max_short = read_length - id_length - len(adapter)
        else:
            max_short = 0
        name = os.path.basename(input_file).split('.')[0]

        id_set = pseq.GroupedSequenceStore(id_length * 2,
                                           tag_size=prefix,
                                           max_diff=threshold,
                                           wildcard='N')
        id_map = {}
        seq = cls({}, id_set, read_length=read_length)

        open_fun = utils.smart_open(input_file)
        line_count = 0
        ping_freq = 40000
        if cls._logger.isEnabledFor(
                logging.INFO) and not cls._logger.isEnabledFor(logging.DEBUG):
            ping_freq = ping_freq * 10
        with open_fun(input_file) as fastq:
            for (line_count, line) in enumerate(fastq):
                # print out some stats as we go
                if cls._logger.isEnabledFor(logging.INFO) and line_count > 0 and \
                        (line_count % ping_freq) == 0:
                    seq.log_progress(line_count)
                elif (line_count % 4) == 1:
                    line = line.rstrip("\n")
                    nameid = line[0:id_length] + line[-id_length:]
                    sequence = line[adapt_length:-adapt_length]
                    is_long = len(sequence) > max_short
                    seq.stats['reads'][is_long] += 1
                elif (line_count % 4) == 3:
                    line = line.rstrip("\n")
                    qnameid = line[0:id_length] + line[-id_length:]
                    qsequence = line[adapt_length:-adapt_length]

                    uid = pseq.SequenceWithQuality(nameid, qnameid)
                    read_seq = pseq.SequenceWithQuality(sequence,
                                                        qsequence,
                                                        name=name)
                    ## Look for similar IDs that may be candidates for merging
                    similar_id = None
                    if nameid in id_map:
                        similar_id = id_map[nameid]
                        seq.stats['total_fixed'][is_long] += 1
                        id_matched = False
                    elif nameid not in seq:
                        similar_id = seq.merge_target(uid, read_seq, id_map,
                                                      threshold)
                        id_matched = False
                        if similar_id is not None:
                            seq.stats['total_fixed'][is_long] += 1
                    else:
                        similar_id = nameid
                        id_matched = True
                    if similar_id is not None:
                        success = seq[similar_id].update(uid, read_seq)
                        if success:
                            if not id_matched:
                                seq.stats['total_merged'][is_long] += 1
                            if seq[similar_id].size == 2:
                                seq.stats['single_count'][is_long] -= 1
                        else:
                            seq.stats['total_skipped'][is_long] += 1
                    else:
                        seq.add(uid, read_seq)
                        seq.stats['single_count'][is_long] += 1
                        seq.stats['clusters'][is_long] += 1
        if cls._logger.isEnabledFor(logging.DEBUG) and line_count > 0:
            seq.log_progress(line_count)
        return seq
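
from_fastq builds an instance via cls(...), fills it read by read and returns it, so it is a classmethod, presumably on the Clustering class from Example #3. A hedged invocation sketch follows; the module and class names, the file name, and all parameter values are assumptions chosen for illustration.

# Illustrative invocation (a sketch; names and values are placeholders).
clusters = clust.Clustering.from_fastq(
    "reads.fastq.gz",            # FASTQ input, opened via utils.smart_open
    id_length=8,                 # UID length at each end of the read
    adapter="ACGTACGTACGT",      # adapter between UID and insert (placeholder)
    threshold=5,                 # max UID differences allowed when merging
    prefix=5,                    # UID prefix length for GroupedSequenceStore
    read_length=150,             # enables short/long fragment bookkeeping
)
print(clusters.stats['clusters'])  # cluster counts, keyed by short/long fragments
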