def test_merge_targets():
    """Identify cluster for merging"""
    tag_a = "ACCT"
    tag_b = "GGGG"
    tag_c = "AAGG"
    # Two clusters' worth of raw reads: 2 copies under tag_a, 5 under tag_b.
    reads_a = ["ACTGTTTGTCTAAGC"] * 2
    quals_a = ['I' * len(reads_a[0])] * len(reads_a)
    reads_b = ["ACTGTTTTTCTAAGC"] * 5
    quals_b = ['I' * len(reads_b[0])] * len(reads_b)
    reads_c = ["ACTGTTTTTCTAAGC"] * 2
    quals_c = ['I' * len(reads_c[0])] * len(reads_c)
    all_uids = [tag_a + tag_a] * len(reads_a) + [tag_b + tag_b] * len(reads_b)
    all_uid_quals = ['I' * (len(tag_a) * 2)] * (len(reads_a) + len(reads_b))
    clusters = create_consensus(all_uids, all_uid_quals,
                                reads_a + reads_b, quals_a + quals_b)
    probes = [
        pseq.SequenceWithQuality(read, qual)
        for read, qual in zip(reads_c, quals_c)
    ]
    # Probe UID differs from tag_b+tag_b in two positions.
    probe_uid = pseq.SequenceWithQuality(tag_b + tag_c,
                                         'I' * (len(tag_b) + len(tag_c)))
    # Allowing 2 mismatches should find the tag_b cluster ...
    cand = clusters.merge_target(probe_uid, probes[0], {}, 2)
    assert cand == tag_b + tag_b, "%r != %r" % (cand, tag_b + tag_b)
    # ... but allowing only 1 mismatch should find nothing.
    cand = clusters.merge_target(probe_uid, probes[0], {}, 1)
    assert cand is None, "%r != %r" % (cand, None)
def test_consensus_diff():
    """Update sequence diff"""
    tail = 'A' * 45
    uid = sequence.SequenceWithQuality("AAAA", "IIII")
    # Three reads differing at a handful of positions, with qualities chosen
    # so that majority vote / quality tie-breaks are both exercised.
    read_a = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC" + tail,
                                          "IIIDIIIIIIIIIII" * 4)
    read_b = sequence.SequenceWithQuality("ACTTTTTGTCTTAGC" + tail,
                                          "IIIIIIIIIDIDIII" * 4)
    read_c = sequence.SequenceWithQuality("ACTTTTTGTGTTAGC" + tail,
                                          "IIIIIIIIIqIDIII" * 4)
    consensus = cons.Consensus(uid, read_b)
    success = consensus.update(uid, read_a)
    assert success, "Sequence %r was rejected" % read_a
    success = consensus.update(uid, read_c)
    # Expected consensus after all three reads, plus the per-position diff
    # tallies recorded along the way.
    seq_expect = "ACTTTTTGTGTAAGC" + tail
    qual_expect = "IIIIIIIIIqIIIII" * 4
    diff_expect = {3: {'T': 2, 'G': 1}, 11: {'A': 1, 'T': 2}, 9: {'C': 2, 'G': 1}}
    assert success, "Sequence %r was rejected" % read_c
    assert consensus.sequence.sequence == seq_expect, \
        "Failed to update consensus (%s != %s)" % (consensus.sequence.sequence, seq_expect)
    assert consensus.sequence.quality == qual_expect, \
        "Failed to update qualities (%s != %s)" % (consensus.sequence.quality, qual_expect)
    assert consensus.diffs == diff_expect, \
        "Incorrect sequence diff (%r != %r)" % (consensus.diffs, diff_expect)
def create_consensus(uids, uid_qual, seqs, seq_qual):
    """Create a clustering of consensus sequences from raw sequences.

    Reads sharing a UID are folded into a single :obj:`cons.Consensus`;
    the first read with a given UID creates the cluster and subsequent
    reads update it.

    Args:
        uids (:obj:`list`): UID sequences.
        uid_qual (:obj:`list`): Quality strings for the UID sequences
            (parallel to ``uids``).
        seqs (:obj:`list`): Read sequences.
        seq_qual (:obj:`list`): Quality strings for the read sequences
            (parallel to ``seqs``).

    Returns:
        :obj:`clust.Clustering`: Clustering of consensus sequences keyed
        by UID sequence.
    """
    uid_with_qual = [
        sequence.SequenceWithQuality(seq, qual)
        for seq, qual in zip(uids, uid_qual)
    ]
    seq_with_qual = [
        sequence.SequenceWithQuality(seq, qual)
        for seq, qual in zip(seqs, seq_qual)
    ]
    cluster = {}
    for uid, seq in zip(uid_with_qual, seq_with_qual):
        if uid.sequence not in cluster:
            cluster[uid.sequence] = cons.Consensus(uid, seq)
        else:
            cluster[uid.sequence].update(uid, seq)
    return clust.Clustering(cluster)
def test_consensus_new():
    """Create objects of class Consensus"""
    read = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    uid = sequence.SequenceWithQuality("AAA", "III")
    consensus = cons.Consensus(uid, read)
    # A fresh consensus starts as a singleton holding exactly the given read.
    assert consensus.sequence == read
    assert consensus.uid == uid
    assert consensus.size == 1
def test_update_uid(qual1, qual2, expect):
    """Retain highest quality"""
    # Parameter names are kept for pytest fixture/parametrize injection.
    first_uid = sequence.SequenceWithQuality("A" * len(qual1), qual1)
    second_uid = sequence.SequenceWithQuality("A" * len(qual2), qual2)
    read = sequence.SequenceWithQuality("A" * 20, "I" * 20)
    consensus = cons.Consensus(first_uid, read)
    consensus._update_uid(second_uid)
    assert consensus.uid.quality == expect, \
        "Failed to retain high quality sequence (%r != %r)" % (consensus.uid.quality, expect)
def test_merge_fail_uid():
    """Don't merge sequences with very different UIDs"""
    # UIDs differ in two positions; with a tolerance of 1 the merge must fail.
    id1 = sequence.SequenceWithQuality("AAAA", "IIII")
    id2 = sequence.SequenceWithQuality("CCAA", "IIII")
    seq = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    cons1 = cons.Consensus(id1, seq)
    cons2 = cons.Consensus(id2, seq)
    merged = cons1.merge(cons2, 1)
    # Fixed typo in the failure message ("unecpectedly" -> "unexpectedly").
    assert not merged, "Merging succeeded unexpectedly"
def test_consensus_skip():
    """Reject sequences that are too different"""
    uid = sequence.SequenceWithQuality("AAA", "III")
    original = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    divergent = sequence.SequenceWithQuality("TTCTCCCTGGTAAGC", "IIIDIIIIIIIIIII")
    consensus = cons.Consensus(uid, original)
    accepted = consensus.update(uid, divergent)
    # The divergent read must be rejected, leave the consensus untouched,
    # and be counted in the `different` tally.
    assert not accepted
    assert consensus.sequence == original, "%r != %r" % (consensus.sequence, original)
    assert consensus.different == 1, "Skipped sequence not counted"
def test_consensus_idlen():
    """Skip sequences with incompatible IDs"""
    short_uid = sequence.SequenceWithQuality("AAAA", "IIII")
    long_uid = sequence.SequenceWithQuality("AAAAA", "IIIII")
    read = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    consensus = cons.Consensus(short_uid, read)
    # An update whose UID length differs from the cluster's must be refused
    # and must not replace the stored UID.
    accepted = consensus.update(long_uid, read)
    assert not accepted
    assert consensus.uid == short_uid, "%r != %r" % (consensus.uid, short_uid)
def test_merge_size():
    """Update size of merged clusters"""
    uid_a = sequence.SequenceWithQuality("AAAA", "IIII")
    uid_b = sequence.SequenceWithQuality("AACA", "IIII")
    read = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    # Grow each cluster to size 2 before merging.
    cluster_a = cons.Consensus(uid_a, read)
    cluster_a.update(uid_a, read)
    cluster_b = cons.Consensus(uid_b, read)
    cluster_b.update(uid_b, read)
    merged = cluster_a.merge(cluster_b, 1)
    assert merged, "Merging failed unexpectedly"
    # 2 + 2 reads -> merged cluster of size 4.
    assert cluster_a.size == 4, "Incorrect size for merged cluster (%d != %d)" % (cluster_a.size, 4)
def test_merge_simple():
    """Combine two consensus sequences"""
    uid_a = sequence.SequenceWithQuality("AAAA", "IIII")
    uid_b = sequence.SequenceWithQuality("AACA", "IIII")
    read = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC", "IIIDIIIIIIIIIII")
    cluster_a = cons.Consensus(uid_a, read)
    cluster_b = cons.Consensus(uid_b, read)
    merged = cluster_a.merge(cluster_b, 1)
    assert merged, "Merging failed unexpectedly"
    # Two singletons with identical reads merge into one cluster of size 2
    # whose consensus is unchanged.
    assert cluster_a.size == 2, "Incorrect size for merged cluster (%d != %d)" % (cluster_a.size, 2)
    assert cluster_a.sequence.sequence == read.sequence, "Incorrect merged sequence (%r != %r)" % \
        (cluster_a.sequence.sequence, read.sequence)
def test_consensus_str():
    """String representation of consensus sequences"""
    suffix = 'A' * 45
    uid = sequence.SequenceWithQuality("AAAA", "IIII")
    read1 = sequence.SequenceWithQuality("ACTGTTTGTCTAAGC" + suffix,
                                         "IIIDIIIIIIIIIII" * 4, name='test')
    read2 = sequence.SequenceWithQuality("ACTTTTTGTCTTAGC" + suffix,
                                         "IIIIIIIIIDIDIII" * 4, name='test')
    consensus = cons.Consensus(uid, read1)
    # Singleton: FASTQ-like record with empty diff line ("+").
    expect_str1 = ("@test:AAAA:IIII:1:0:0:0\nACTGTTTGTCTAAGC" + suffix +
                   "\n+\n" + "IIIDIIIIIIIIIII" * 4)
    expect_repr1 = ("Consensus(uid=SequenceWithQuality(sequence='AAAA', "
                    "quality='IIII', name=''), "
                    "sequence=SequenceWithQuality(sequence='ACTGTTTGTCTAAGC" + suffix + "', "
                    "quality='" + 'IIIDIIIIIIIIIII' * 4 +
                    "', name='test'), "
                    "diffs={}, size=1)")
    # After one update: size 2 and the diff summary on the "+" line.
    expect_str2 = ("@test:AAAA:IIII:2:0:0:0\nACTTTTTGTCTAAGC" + suffix +
                   "\n+4G1T1 12A1T1\n" + "IIIIIIIIIIIIIII" * 4)
    assert str(consensus) == expect_str1, "\n%s\n!=\n%s" % (consensus, expect_str1)
    assert repr(consensus) == expect_repr1, "\n%r\n!=\n%r" % (consensus, expect_repr1)
    consensus.update(uid, read2)
    assert str(consensus) == expect_str2, "\n%s\n!=\n%s" % (str(consensus), expect_str2)
def test_consensus_seqlen():
    """Skip shorter sequences"""
    uid = sequence.SequenceWithQuality("AAAA", "IIII")
    long_read = sequence.SequenceWithQuality("AACTGTGAGTGTAGATGTTCTGTA", "I" * 24)
    short_read = sequence.SequenceWithQuality("AACTGTGAGTGTAGATGTTC", "I" * 20)

    # Long read first: the shorter read is rejected and counted.
    consensus = cons.Consensus(uid, long_read)
    accepted = consensus.update(uid, short_read)
    assert not accepted
    assert consensus.sequence == long_read, "%r != %r" % (consensus.sequence, long_read)
    assert consensus.shorter == 1, "Skipped sequence not recorded"

    # Short read first: a singleton consensus adopts the longer read while
    # still counting the shorter one as skipped.
    consensus = cons.Consensus(uid, short_read)
    accepted = consensus.update(uid, long_read)
    assert not accepted
    assert consensus.sequence == long_read, "%r != %r" % (consensus.sequence, long_read)
    assert consensus.shorter == 1, "Skipped sequence not recorded"

    # Established consensus (size 2): the longer read is rejected instead.
    consensus = cons.Consensus(uid, short_read)
    accepted = consensus.update(uid, short_read)
    assert accepted
    accepted = consensus.update(uid, long_read)
    assert not accepted
    assert consensus.sequence == short_read, "%r != %r" % (consensus.sequence, short_read)
    assert consensus.longer == 1, "Skipped sequence not recorded"
def from_fastq(cls, input_file, id_length, adapter, threshold=5, prefix=5,
               read_length=None):
    """Read FASTQ file to generate consensus sequences.

    Args:
        input_file (:obj:`str`): Name of input file.
        id_length (:obj:`int`): Length of UID sequence at beginning/end of read.
        adapter (:obj:`str`): Adapter sequence.
        threshold (:obj:`int`, optional): Maximum number of differences allowed between
            UIDs.
        prefix (:obj:`int`, optional): Length of UID prefix to use in clustering
            algorithm.
        read_length (:obj:`int`, optional): Original read length used. If this is set and
            and the logging level is sufficiently high, additional log entries
            are generated to track the number of short and long fragments
            processed.

    Returns:
        :obj:`dict`: Computed consensus sequences.
    """
    # Each read carries an id_length UID followed by the adapter at both ends;
    # the payload is what lies between the two adapter+UID flanks.
    adapt_length = id_length + len(adapter)
    if read_length is not None:
        # Reads shorter than this retain both flanks and are classed "short".
        max_short = read_length - id_length - len(adapter)
    else:
        max_short = 0
    name = os.path.basename(input_file).split('.')[0]
    # Store of known (start+end) UIDs, bucketed by a `prefix`-length tag so
    # near-match lookups stay cheap; 'N' acts as a wildcard base.
    id_set = pseq.GroupedSequenceStore(id_length * 2, tag_size=prefix,
                                       max_diff=threshold, wildcard='N')
    # Maps a raw UID to the cluster UID it was previously merged into, so
    # repeat observations skip the (expensive) merge_target search.
    id_map = {}
    seq = cls({}, id_set, read_length=read_length)
    # NOTE(review): smart_open is called with input_file and its result is
    # called with input_file again below — presumably it returns an opener
    # (e.g. gzip.open vs open) chosen from the file name; confirm in utils.
    open_fun = utils.smart_open(input_file)
    line_count = 0
    ping_freq = 40000
    if cls._logger.isEnabledFor(
            logging.INFO) and not cls._logger.isEnabledFor(logging.DEBUG):
        # At INFO (but not DEBUG) log progress 10x less often.
        ping_freq = ping_freq * 10
    with open_fun(input_file) as fastq:
        for (line_count, line) in enumerate(fastq):
            # print out some stats as we go
            # (ping_freq is a multiple of 4, so the ping only ever fires on
            # FASTQ header lines and never shadows a sequence/quality line)
            if cls._logger.isEnabledFor(logging.INFO) and line_count > 0 and \
                    (line_count % ping_freq) == 0:
                seq.log_progress(line_count)
            elif (line_count % 4) == 1:
                # Sequence line: split off the two UID halves and the payload.
                # NOTE(review): local `sequence` shadows the module alias used
                # elsewhere in this file; harmless here but easy to trip over.
                line = line.rstrip("\n")
                nameid = line[0:id_length] + line[-id_length:]
                sequence = line[adapt_length:-adapt_length]
                is_long = len(sequence) > max_short
                seq.stats['reads'][is_long] += 1
            elif (line_count % 4) == 3:
                # Quality line: same slicing as the sequence line two lines up.
                line = line.rstrip("\n")
                qnameid = line[0:id_length] + line[-id_length:]
                qsequence = line[adapt_length:-adapt_length]
                uid = pseq.SequenceWithQuality(nameid, qnameid)
                read_seq = pseq.SequenceWithQuality(sequence,
                                                    qsequence, name=name)
                ## Look for similar IDs that may be candidates for merging
                similar_id = None
                if nameid in id_map:
                    # UID was merged before: reuse the cached cluster UID.
                    similar_id = id_map[nameid]
                    seq.stats['total_fixed'][is_long] += 1
                    id_matched = False
                elif nameid not in seq:
                    # Unknown UID: search for a near-match cluster to join.
                    similar_id = seq.merge_target(uid, read_seq, id_map,
                                                  threshold)
                    id_matched = False
                    if similar_id is not None:
                        seq.stats['total_fixed'][is_long] += 1
                else:
                    # Exact UID match against an existing cluster.
                    similar_id = nameid
                    id_matched = True
                if similar_id is not None:
                    success = seq[similar_id].update(uid, read_seq)
                    if success:
                        if not id_matched:
                            seq.stats['total_merged'][is_long] += 1
                        # Cluster just left singleton status.
                        if seq[similar_id].size == 2:
                            seq.stats['single_count'][is_long] -= 1
                    else:
                        seq.stats['total_skipped'][is_long] += 1
                else:
                    # No candidate cluster at all: start a new singleton.
                    seq.add(uid, read_seq)
                    seq.stats['single_count'][is_long] += 1
                    seq.stats['clusters'][is_long] += 1
    if cls._logger.isEnabledFor(logging.DEBUG) and line_count > 0:
        # Final progress entry so the log reflects the full file.
        seq.log_progress(line_count)
    return seq