#!/usr/bin/env python
# encoding: utf-8
"""
De novo identification and quantification of sequence data.
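
Typical workflow (file names are illustrative, not prescribed by the tool):

    python sequence_clustering.py quantify sample1.fastq > sample1.counts.txt
    python sequence_clustering.py consensus sample1.counts.txt sample2.counts.txt > consensus.txt
    python sequence_clustering.py matrix consensus.txt sample1.counts.txt sample2.counts.txt -n deseq > matrix.txt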
"""
import sys
import numpy as np
import pandas as pd
import os.path as op
from toolshed import nopen, reader
from Bio import trie, triefind  # C trie implementation shipped with older Biopython releases
from collections import Counter
from itertools import islice, ifilterfalse
__version__ = "0.4"

def read_fastq(fh):
    """FASTQ parser that yields name, seq, and qual."""
    while True:
        values = list(islice(fh, 4))
        if len(values) == 4:
            id1, seq, id2, qual = values
        elif len(values) == 0:
            # file exhausted on a record boundary; end the generator
            return
        else:
            raise EOFError("unexpected end of file")
        assert id1.startswith('@'), "invalid FASTQ header line"
        assert id2.startswith('+'), "invalid FASTQ separator line"
        assert len(seq) == len(qual), "sequence and quality lengths differ"
        yield id1[1:].rstrip('\r\n'), seq.rstrip('\r\n'), qual.rstrip('\r\n')
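
# A FASTQ record spans exactly four lines; a minimal illustrative read:
#   @read1
#   ACGTACGTACGTACGTACGT
#   +
#   IIIIIIIIIIIIIIIIIIII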

def trim_seq(seq, base=5):
    """truncate sequence length down to the nearest multiple of `base`"""
    return seq[:(len(seq) // base) * base]
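
# For example (lengths are what matter; the bases are arbitrary):
#   trim_seq("ACGTACGTAC", 4) -> "ACGTACGT" (10 bases truncated to 8)
#   trim_seq("ACGTACG", 5)    -> "ACGTA"    (7 bases truncated to 5)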

def process_exact_fastq(fastq, n):
    """Group identical reads using a Counter. Reads are trimmed to a multiple
    of 4 bases and discarded when the trimmed length is below `n`.
    Returns Counter.
    """
    c = Counter()
    with nopen(fastq) as fh:
        for name, seq, qual in read_fastq(fh):
            seq = trim_seq(seq, 4)
            if len(seq) < n: continue
            c.update([seq])
    return c

def process_exact_txt(files, cutoff):
    """returns Counter built from multiple `quantify` runs; sequences with a
    count below `cutoff` in a given file are ignored."""
    c = Counter()
    for f in files:
        for l in reader(f, header=['seq', 'count']):
            if int(l['count']) < cutoff: continue
            # count each surviving sequence once per file; per-sample
            # abundances are not needed when building the consensus
            c.update([l['seq']])
    return c

def process_counted(fp, cutoff):
    """Get sequence counts and total library size during `run_matrix`."""
    sequence_counts = Counter()
    library_size = 0
    for l in reader(fp, header=['seq', 'count']):
        count = int(l['count'])
        # library size includes all reads, even those below the cutoff
        library_size += count
        if count < cutoff: continue
        sequence_counts[l['seq']] = count
    return sequence_counts, library_size
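
# Input rows are the tab-delimited output of `quantify`, e.g. (hypothetical
# sequence and count):
#   TACCCTGTAGATCCGAATTTGT	1324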

def get_seq_bins(fp):
    """Read consensus sequences (one per line, no counts) into a Counter
    with every count initialized to zero."""
    c = Counter()
    for l in nopen(fp):
        c[l.strip()] = 0
    return c

def chunker(it, n):
    """sliding windows of width `n` (step 1) across `it`"""
    # chunker('AAAABBBC', 4) --> AAAA AAAB AABB ABBB BBBC
    return [it[i:i+n] for i in xrange(0, len(it)+1-n, 1)]

def construct_simple_trie(counter):
    """Build a trie mapping each observed sequence to its count."""
    t = trie.trie()
    for seq, count in counter.iteritems():
        t[seq] = count
    return t
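
# Sketch of the Bio.trie calls relied on throughout (old Biopython C trie
# API); values may be any Python object:
#   t = trie.trie()
#   t['ACGT'] = 5
#   t.has_key('ACGT')             # -> True
#   t.get_approximate('ACGA', 1)  # keys within 1 mismatch, as a list of
#                                 # (key, value, mismatches) tuples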

def construct_complex_trie(counter, lengths=None):
    """Build a trie over full sequences and their subsequences. A full
    sequence maps to its count (int); a subsequence maps to its longest
    parent sequence (str).
    """
    t = trie.trie()
    seqs = list(counter)
    seqs.sort(key=len, reverse=True)
    if lengths is None:
        lengths = sorted(set([len(k) for k in seqs]))
    for seq in seqs:
        seq_len = len(seq)
        for l in lengths:
            if l > seq_len: continue
            for subseq in chunker(seq, l):
                # first insertion wins; longer parents are processed first
                if t.has_key(subseq): continue
                if subseq == seq:
                    t[seq] = counter[seq]
                else:
                    t[subseq] = seq
    return t
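
# Illustration with a toy counter {'ACGTACGT': 5, 'ACGT': 2}: the windows of
# 'ACGTACGT' (processed first, being longer) all map to the string
# 'ACGTACGT', including the window 'ACGT'; so the shorter full sequence
# 'ACGT' is never stored with its own count, and looking it up resolves to
# its parent sequence.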

def process_exact_substring(counter, t):
    """use triefind.find to gather identical substring matches"""
    seqs = list(counter)
    seqs.sort(key=len, reverse=True)
    for seq in seqs:
        l = len(seq)
        for (match, start, end) in triefind.find(seq, t):
            # skip the trivial self match; fold substring counts upward
            if len(match) == l: continue
            counter[seq] += counter[match]
            counter[match] = 0
    # adding an empty Counter drops zero-count entries
    counter += Counter()
    return counter
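
# For instance, with counts {'ACGTACGT': 5, 'GTAC': 2}: 'GTAC' occurs inside
# 'ACGTACGT', so its 2 reads are folded into the longer sequence and the
# zero-count entry is dropped, leaving {'ACGTACGT': 7}.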

def unique_everseen(iterable, key=None):
    """List unique elements, preserving order (itertools recipe)."""
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

def process_similar(counter, t, n):
    """Collapse sequences within `n` mismatches of one another. The trie is
    composed of the sequences being compared (see construct_complex_trie).
    """
    seqs = list(counter)
    seqs.sort(key=len, reverse=True)
    # log progress at geometrically increasing intervals
    progress = 100
    to_process = len(seqs)
    for i, seq in enumerate(seqs, start=1):
        if i % progress == 0:
            progress = int(progress * 1.5)
            print >>sys.stderr, "processed %d of %d" % (i, to_process)
        # already collapsed into another bin
        if counter[seq] == 0: continue
        for (k, v, dist) in unique_everseen(t.get_approximate(seq, n), lambda (m,c,d): m):
            if dist == 0 or k == seq: continue
            if type(v) is int:
                # k is a full sequence; absorb its counts
                counter[seq] += counter[k]
                counter[k] = 0
            else:
                # k is a subsequence; therefore add seq's counts to its parent v
                counter[v] += counter[seq]
                counter[seq] = 0
    # adding an empty Counter drops zero-count entries
    counter += Counter()
    return counter
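
# Toy example with n=1 and counts {'ACGTACGT': 10, 'ACGTACGA': 3}: the two
# full-length sequences differ by one mismatch, so whichever is processed
# first absorbs the other's reads, leaving a single bin with 13 counts.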

def process_similar_matrix(bins, seqs, t, n):
    """
    bins - sequence bins (consensus sequences, counts start at zero)
    seqs - sequences to bin
    t    - trie over the bin sequences
    n    - allowed mismatches
    returns Counter
    """
    sample_seqs = list(seqs)
    sample_seqs.sort(key=len, reverse=True)
    to_process = len(sample_seqs)
    # log progress at geometrically increasing intervals
    progress = 100
    for i, seq in enumerate(sample_seqs, start=1):
        if i % progress == 0:
            progress = int(progress * 1.5)
            print >>sys.stderr, " >> processed {i} of {to_process}".format(**locals())
        # find the bins to which the sequence belongs
        for (k, v, dist) in unique_everseen(t.get_approximate(seq, n), lambda (m,c,d): m):
            # an int value marks a full bin sequence; a str value points to
            # the parent bin of a subsequence
            bin_seq = k if type(v) is int else v
            bins[bin_seq] += seqs[seq]
            # zero out so a sequence's counts are not added to multiple bins
            seqs[seq] = 0
    return bins

def scalefactor(counts):
    # mask inf and nan (from log of zero counts) before taking the median
    ma = np.ma.masked_invalid(counts)
    return np.exp(np.ma.median(ma))
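
# Numeric sketch: for a sample whose log-ratios to the per-sequence geometric
# mean are [0.1, -inf, 0.2], the -inf is masked, the median is 0.15, and the
# size factor is exp(0.15) ~= 1.16.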

def write_table(d, library_sizes, norm=None):
    df = pd.DataFrame(d)
    if norm == "deseq":
        # details: http://genomebiology.com/2010/11/10/R106
        # log of counts
        lg = df.apply(np.log)
        # per sample: exp(median(log(count) - log of per-sequence geometric mean))
        sf = lg.sub(lg.mean(axis=1), axis=0).apply(scalefactor, axis=0)
        # apply scaling
        df = df.div(sf, axis=1)
    elif norm == "totalcount":
        mean_total_count = float(sum(library_sizes.values())) / len(library_sizes)
        # scale each sample to the mean library size
        for col in df.columns:
            denominator = float(library_sizes[col])
            assert denominator > 0, \
                "No counts found in sample {sampleid}".format(sampleid=col)
            df[col] = (df[col] / denominator) * mean_total_count
    df.to_csv(sys.stdout, sep="\t")
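
# `d` is a nested dict of {sample: {sequence: count}}, so the resulting
# DataFrame has one row per consensus sequence and one column per sample,
# e.g. (hypothetical values):
#   d = {'sampleA': {'ACGT': 10}, 'sampleB': {'ACGT': 4}}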

def run_quantify(args):
    print >>sys.stderr, ">> collapsing identical sequences (1/5)"
    reads = process_exact_fastq(args.fastq, args.cutoff)
    print >>sys.stderr, ">> constructing trie (2/5)"
    t = construct_simple_trie(reads)
    print >>sys.stderr, ">> collapsing identical subsequences (3/5)"
    reads = process_exact_substring(reads, t)
    print >>sys.stderr, ">> optimizing trie (4/5)"
    t = construct_complex_trie(reads)
    print >>sys.stderr, ">> collapsing similar sequences (5/5)"
    reads = process_similar(reads, t, args.mismatch)
    for seq, count in reads.iteritems():
        print "%s\t%d" % (seq, count)

def run_consensus(args):
    """Same pipeline as `quantify`, except it prints only the consensus
    sequences (longest first) without their counts.
    """
    print >>sys.stderr, ">> collapsing identical sequences (1/5)"
    seqs = process_exact_txt(args.bins, args.cutoff)
    print >>sys.stderr, ">> constructing trie (2/5)"
    t = construct_simple_trie(seqs)
    print >>sys.stderr, ">> collapsing identical subsequences (3/5)"
    seqs = process_exact_substring(seqs, t)
    print >>sys.stderr, ">> optimizing trie (4/5)"
    t = construct_complex_trie(seqs)
    print >>sys.stderr, ">> collapsing similar sequences (5/5)"
    seqs = process_similar(seqs, t, args.mismatch)
    s = list(seqs)
    s.sort(key=len, reverse=True)
    print "\n".join(s)

def run_matrix(args):
    d = {}
    samples = set()
    to_process = len(args.counts)
    library_sizes = {}
    for i, f in enumerate(args.counts, start=1):
        sample = op.splitext(op.basename(f))[0]
        samples.add(sample)
        # sample names (file basenames) must be unique
        assert len(samples) == i
        print >>sys.stderr, (">> processing sample {sample} "
                             "({i}/{to_process})").format(sample=sample,
                                                          i=i,
                                                          to_process=to_process)
        d[sample] = {}
        # the sequence counts of the current sample and its total library size
        seqs, library_size = process_counted(f, args.cutoff)
        library_sizes[sample] = library_size
        seq_lengths = sorted(set([len(k) for k in list(seqs)]))
        seq_bins = get_seq_bins(args.consensus)
        # trie over the bin sequences at the lengths of the query sequences
        t = construct_complex_trie(seq_bins, seq_lengths)
        # assign the sample's sequences to consensus bins
        counts = process_similar_matrix(seq_bins, seqs, t, args.mismatch)
        for k, v in counts.iteritems():
            d[sample][k] = v
    write_table(d, library_sizes, args.norm)

def main(args):
    args.func(args)

if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('--version', action='version',
                   version='%(prog)s ' + __version__)
    subp = p.add_subparsers(help='commands')
    fquant = subp.add_parser('quantify',
        description=("Find and quantify unique and similar sequences "
                     "within a FASTQ."),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        help="quantify unique and similar sequences")
    fquant.add_argument("fastq", metavar="FASTQ", help="reads to process")
    fquant.add_argument("-c", dest="cutoff", type=int, default=18,
        help="minimum allowable sequence length")
    fquant.add_argument("-m", dest="mismatch", type=int, default=3,
        help="mismatch tolerance when grouping bins")
    fquant.set_defaults(func=run_quantify)
    fcons = subp.add_parser('consensus',
        description="Build consensus of sequences across all samples.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        help="build observed sequence library")
    fcons.add_argument('bins', metavar='BINS', nargs="+",
        help="results of `quantify`")
    fcons.add_argument('-c', dest='cutoff', type=int, default=100,
        help="minimum allowable count")
    fcons.add_argument('-m', dest='mismatch', type=int, default=3,
        help="mismatch tolerance when grouping bins")
    fcons.set_defaults(func=run_consensus)
    fmat = subp.add_parser('matrix', description="Generate counts matrix.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        help="generate counts matrix")
    fmat.add_argument("consensus", metavar="CONSENSUS",
        help="result of `consensus`")
    fmat.add_argument("counts", metavar="COUNTS", nargs="+",
        help="results of `quantify`")
    fmat.add_argument('-c', dest='cutoff', type=int, default=100,
        help="minimum allowable count for individual sample sequences")
    fmat.add_argument("-m", dest="mismatch", type=int, default=3,
        help="mismatch tolerance when grouping bins")
    fmat.add_argument("-n", dest="norm", default=None,
        choices=['deseq', 'totalcount'],
        help="output normalized table using either DESeq or total count method")
    fmat.set_defaults(func=run_matrix)
    args = p.parse_args()
    main(args)