-
Notifications
You must be signed in to change notification settings - Fork 1
/
domestication.py
551 lines (469 loc) · 20.3 KB
/
domestication.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
# Copyright 2013 Diego Orzaez, Univ.Politecnica Valencia, Consejo Superior de
# Investigaciones Cientificas
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import re
from itertools import izip_longest
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from goldenbraid.settings import (DOMESTICATION_DEFAULT_MELTING_TEMP,
DOMESTICATION_MIN_OLIGO_LENGTH,
DOMEST_VECTOR_PREFIX,
DOMEST_VECTOR_SUFFIX,
OLIGO_UNIVERSAL, DOMESTICATED_SEQ,
MINIMUN_PCR_LENGTH, CRYSPER_SEQ,
DOMESTICATED_VECTOR,
MANDATORY_DOMEST_ENZYMES)
from goldenbraid.models import Feature, Count
from Bio.SeqFeature import FeatureLocation, CompoundLocation, SeqFeature
from goldenbraid.tags import (TARGET_MONOCOT, TARGET_DICOT, CDS, CDS1_CDS2,
NTAG, CDS1)
from goldenbraid.utils import (get_ret_sites, has_rec_sites,
get_prefix_and_suffix_index)
def get_codontable():
    '''Return a dict that maps every DNA codon to its one-letter amino acid.

    Codons are enumerated with the bases in T, C, A, G order for each of the
    three positions; stop codons are mapped to '*'.
    '''
    nucleotides = 'TCAG'
    residues = ('FFLLSSSSYY**CC*W'
                'LLLLPPPPHHQQRRRR'
                'IIIMTTTTNNKKSSRR'
                'VVVVAAAADDEEGGGG')
    table = {}
    position = 0
    for first in nucleotides:
        for second in nucleotides:
            for third in nucleotides:
                table[first + second + third] = residues[position]
                position += 1
    return table
def domesticate_for_synthesis(seqrec, category, prefix, suffix, enzymes,
                              with_intron=False):
    '''Domesticate a sequence that is going to be synthesized.

    The recognition sites of ``enzymes`` (MANDATORY_DOMEST_ENZYMES when the
    argument is empty) are removed from ``seqrec.seq``; the sequence is
    upper-cased first unless ``with_intron`` is true.

    Returns a ``(seq_for_synthesis, prepared_seq)`` tuple:
      * seq_for_synthesis: the domesticated sequence with the tags added by
        _add_tags_to_pcrproducts, as a string.
      * prepared_seq: a SeqRecord with prefix + domesticated sequence +
        suffix + stripped vector sequence and a 'misc_feature' that spans
        the domesticated sequence.
    '''
    sequence = seqrec.seq if with_intron else seqrec.seq.upper()
    enzymes = enzymes or MANDATORY_DOMEST_ENZYMES

    domesticated = _remove_rec_sites(sequence, enzymes)[0]
    tagged = _add_tags_to_pcrproducts([domesticated], prefix, suffix, category)
    tagged_seqs, prefix, suffix = tagged

    # the record name is built from the value handed out by the Count model
    try:
        counter = Count.objects.get(name=DOMESTICATED_SEQ)
    except Count.DoesNotExist:
        counter = Count.objects.create(name=DOMESTICATED_SEQ, value=1)
    record_name = DOMESTICATED_SEQ + '_' + counter.next

    vector_tail = _get_stripped_vector_seq()
    full_seq = prefix + domesticated + suffix + vector_tail
    seq_for_synthesis = str(tagged_seqs[0])

    prepared_seq = SeqRecord(full_seq, name=record_name, id=record_name)
    part_start = len(prefix)
    part_location = FeatureLocation(part_start, len(domesticated) + part_start)
    prepared_seq.features.append(SeqFeature(part_location,
                                            type='misc_feature',
                                            id=prepared_seq.id))
    return seq_for_synthesis, prepared_seq
def domesticate(seqrec, category, prefix, suffix, enzymes=None, with_intron=False):
    '''Domesticate a sequence by PCR.

    The recognition sites of ``enzymes`` (MANDATORY_DOMEST_ENZYMES when the
    argument is empty) are removed from ``seqrec.seq``, the result is split
    in PCR segments and a pair of oligos is designed for every segment. The
    sequence is upper-cased first unless ``with_intron`` is true.

    Returns a ``(oligo_pcrs, new_seq_record)`` tuple:
      * oligo_pcrs: one dict per PCR with the keys 'pcr_product',
        'oligo_forward' and 'oligo_reverse'.
      * new_seq_record: a SeqRecord with prefix + domesticated sequence +
        suffix + stripped vector sequence, a 'misc_feature' that spans the
        domesticated sequence and, when ``with_intron`` is true, a CDS
        feature.

    Raises RuntimeError('Repeated overhang') if two forward oligos share the
    same nucleotides in positions 12-15.
    '''
    seq = seqrec.seq if with_intron else seqrec.seq.upper()
    enzymes = enzymes or MANDATORY_DOMEST_ENZYMES

    new_seq, rec_site_pairs, fragments = _remove_rec_sites(seq, enzymes)
    segments = _get_pcr_segments(new_seq, rec_site_pairs, fragments)
    pcr_products = [str(new_seq[seg['start']:seg['end'] + 1])
                    for seg in segments]
    oligos = _get_oligos(new_seq, segments,
                         DOMESTICATION_DEFAULT_MELTING_TEMP)
    oligos = _add_tags_to_oligos(oligos, prefix, suffix, category)

    # check that the overhangs (positions 12-15 of the forward oligos)
    # are all different
    seen_overhangs = set()
    for oligo_pair in oligos:
        overhang = oligo_pair[0][11:15]
        if overhang in seen_overhangs:
            raise RuntimeError('Repeated overhang')
        seen_overhangs.add(overhang)

    pcr_products, prefix, suffix = _add_tags_to_pcrproducts(pcr_products,
                                                            prefix, suffix,
                                                            category)
    oligo_pcrs = [{'pcr_product': pcr_product,
                   'oligo_forward': oligo_pair[0],
                   'oligo_reverse': oligo_pair[1]}
                  for pcr_product, oligo_pair in zip(pcr_products, oligos)]

    # the record name is built from the value handed out by the Count model
    try:
        counter = Count.objects.get(name=DOMESTICATED_SEQ)
    except Count.DoesNotExist:
        counter = Count.objects.create(name=DOMESTICATED_SEQ, value=1)
    next_value = counter.next

    vector_tail = _get_stripped_vector_seq()
    full_seq = prefix + new_seq + suffix + vector_tail
    record_name = DOMESTICATED_SEQ + '_' + next_value
    new_seq_record = SeqRecord(full_seq, name=record_name, id=record_name)

    part_start = len(prefix)
    part_location = FeatureLocation(part_start, len(new_seq) + part_start)
    new_seq_record.features.append(SeqFeature(part_location,
                                              type='misc_feature',
                                              id=new_seq_record.id))
    if with_intron:
        new_seq_record.features.append(_get_cds_from_seq(seq, prefix))
    return oligo_pcrs, new_seq_record
def _get_cds_from_seq(seq, prefix):
    '''Build a CDS feature out of the upper case stretches of seq.

    Every run of upper case letters becomes one part of the compound
    location, shifted by len(prefix). The 'translation' qualifier holds the
    translation of the upper case nucleotides.
    '''
    exon_locations = [FeatureLocation(exon.start() + len(prefix),
                                      exon.end() + len(prefix), strand=1)
                      for exon in re.finditer('[A-Z]+', str(seq))]
    translation = Seq(_get_upper_nucls(seq)).translate()
    return SeqFeature(CompoundLocation(exon_locations), type='CDS', strand=1,
                      qualifiers={'translation': translation})
def _get_oligos(seq, segments, min_melting_temp):
oligos = []
for segment in segments:
forward_min = segment.get('forward_min', None)
if forward_min:
forward_min = forward_min - segment['start'] + 1
forw_oligo = _get_oligo(seq[segment['start']:], min_melting_temp,
forward_min)
reverse_min = segment.get('reverse_min', None)
if reverse_min:
reverse_min = segment['end'] - reverse_min
rev_oligo = _get_oligo(seq[:segment['end'] + 1].reverse_complement(),
min_melting_temp, reverse_min)
oligos.append((forw_oligo, rev_oligo))
return oligos
def _get_pcr_segments(seq, rec_sites, fragments):
    '''Split the domesticated sequence in the segments to amplify by PCR.

    A segment boundary is placed on every modified recognition site
    (``rec_sites`` are the original/modified pairs and ``fragments`` the
    pieces of sequence between them). The resulting (start, end) pairs are
    handed to _join_segments and its result is returned.
    '''
    starts = [0]
    ends = []
    consumed_len = 0
    # these overhangs can not be used in our segments
    overhangs = [DOMEST_VECTOR_PREFIX, DOMEST_VECTOR_SUFFIX]
    for index, (frag_5, rec_site) in enumerate(zip(fragments, rec_sites)):
        # the fragment that follows the rec site, if there is one
        if index + 1 < len(fragments):
            frag_3 = fragments[index + 1]
        else:
            frag_3 = None
        start, end, overhangs = _get_segments_from_rec_site(frag_5, frag_3,
                                                            rec_site,
                                                            consumed_len,
                                                            overhangs)
        starts.append(start)
        ends.append(end)
        consumed_len += len(frag_5) + len(rec_site['modified'])
    ends.append(len(seq) - 1)
    return _join_segments(zip(starts, ends))
def _join_segments(segments, min_length=MINIMUN_PCR_LENGTH):
    '''Turn (start, end) pairs into segment dicts and join the short ones.

    _join_short_segments is applied again and again until
    _all_segments_ok accepts the whole list for ``min_length``.
    '''
    joined = [dict(start=pair[0], end=pair[1]) for pair in segments]
    while True:
        if _all_segments_ok(joined, min_length):
            return joined
        joined = _join_short_segments(joined, min_length)
def _all_segments_ok(segments, min_length):
for segment in segments:
if segment['end'] - segment['start'] < min_length:
return False
return True
def _join_short_segments(segments, min_length):
len_segments = len(segments)
joined_segments = []
skip_segment = False
for index, segment in enumerate(segments):
start = segment['start']
end = segment['end']
if end - start < min_length:
# if not last segment
if index + 1 != len_segments:
if skip_segment:
joined_segments[-1]['end'] = segments[index + 1]['end']
joined_segments[-1]['forward_min'] = end + 8
new_segment = None
else:
new_segment = {'start': start,
'end': segments[index + 1]['end'],
'forward_min': end + 8}
else:
joined_segments[-1]['end'] = end
joined_segments[-1]['reverse_min'] = start - 8
new_segment = None
skip_segment = True
else:
if skip_segment:
skip_segment = False
continue
new_segment = {'start': start, 'end': end}
if new_segment:
joined_segments.append(new_segment)
return joined_segments
def is_dna_palindrome(seq):
    '''Tell if seq is equal to its own reverse complement.

    Only sequences of even length made of upper case A, C, G and T can be
    palindromes; anything else returns False.
    '''
    if len(seq) % 2:
        return False
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return all(complement.get(nucl) == partner
               for nucl, partner in zip(seq, seq[::-1]))
def _get_segments_from_rec_site(frag_5, frag_3, rec_site, prev_seq_len,
                                overhangs):
    '''Choose where two PCR segments meet around a modified rec site.

    The boundary is placed next to the first nucleotide that differs
    between rec_site['original'] and rec_site['modified']. The window is
    moved one nucleotide at a time while its overhang is a palindrome or
    it (or its reverse complement) is already in ``overhangs``.

    Returns ``(rev_start, fow_end, overhangs)``: the start of the next
    segment, the end of the previous one and the list of used overhangs,
    to which the chosen overhang has been appended in place.

    Raises RuntimeError when no valid overhang is found close enough to
    the modified nucleotide.
    '''
    def overhang_at(window_start, window_end):
        return get_overhang(window_start, window_end, prev_seq_len, frag_5,
                            frag_3, rec_site)

    # position of the first modified nucleotide inside the rec site
    letter_pairs = list(zip(rec_site['original'], rec_site['modified']))
    change_pos = next((pos for pos, (old, new) in enumerate(letter_pairs)
                       if old != new), len(letter_pairs))
    change_index = prev_seq_len + len(frag_5) + change_pos

    fow_end = change_index + 1
    overhang = overhang_at(fow_end - 3, fow_end)
    if is_dna_palindrome(overhang):
        fow_end = change_index + 2
        overhang = overhang_at(fow_end - 3, fow_end)
    rev_start = fow_end - 3

    attempts = 0
    rev_comp = str(Seq(overhang).reverse_complement())
    while (overhang in overhangs or rev_comp in overhangs or
           is_dna_palindrome(overhang)):
        rev_start += 1
        fow_end += 1
        overhang = overhang_at(rev_start, fow_end)
        rev_comp = str(Seq(overhang).reverse_complement())
        if attempts > 10:
            raise RuntimeError('Impossible to domesticate this sequence\n:'
                               'Domesticated rec site nucleotide is too far '
                               'from oligo'
                               ' start')
        attempts += 1
    overhangs.append(overhang)
    return rev_start, fow_end, overhangs
def get_overhang(rev_start, fow_end, prev_seq_len, frag_5, frag_3, rec_site):
    '''Return the overhang found between rev_start and fow_end.

    rev_start and fow_end are positions in the whole sequence; they are
    converted to positions inside rec_site['modified'] by subtracting
    prev_seq_len and len(frag_5). The overhang is the slice of the modified
    rec site from rev_start to fow_end, both included.

    When the window goes past the end of the rec site and there is a
    following fragment (frag_3 is not None), the overhang is completed up to
    four nucleotides with the first nucleotides of frag_3, upper-cased.
    With no following fragment the possibly shorter slice is returned as is.

    Raises IndexError if frag_3 is too short to complete the overhang.
    '''
    site_offset = prev_seq_len + len(frag_5)
    overhang_start = rev_start - site_offset
    overhang_end = fow_end - site_offset
    overhang = rec_site['modified'][overhang_start:overhang_end + 1]
    # the last rec site has no following fragment, so nothing is added
    if frag_3 is not None:
        index = 0
        while len(overhang) < 4:
            overhang += frag_3[index].upper()
            # move on to the next nucleotide: without this the first
            # nucleotide of frag_3 was repeated instead of following the
            # real sequence
            index += 1
    return overhang
def _get_stripped_vector_seq():
    '''Return the piece of the domestication vector kept in the final seq.

    The vector is the Feature named DOMESTICATED_VECTOR. The piece goes
    from the prefix index to the suffix index plus the prefix size, as
    given by get_prefix_and_suffix_index for the first enzyme in the
    vector's enzyme_in.
    '''
    vector = Feature.objects.get(uniquename=DOMESTICATED_VECTOR)
    residues = vector.residues
    prefix_index, suffix_index, prefix_size = get_prefix_and_suffix_index(
        residues, vector.enzyme_in[0])
    piece_end = suffix_index + prefix_size
    if prefix_index <= piece_end:
        return residues[prefix_index:piece_end]
    # the piece goes past the end of the stored sequence and continues
    # from its beginning (presumably because the vector is circular)
    return residues[prefix_index:] + residues[:piece_end]
def _guess_prefix_suffix_tag(kind, prefix, suffix):
    '''It returns the prefix and suffix to add to oligos and pcr_products.

    The CDS, CDS1_CDS2, CDS1 and NTAG categories override the given
    prefix (and, except CDS, also the suffix); for any other category the
    given values are returned untouched.
    '''
    if kind == CDS:
        return 'A', suffix
    if kind == CDS1_CDS2:
        return 'A', 'GGTTCG'
    if kind == CDS1:
        return 'A', 'GCAGCC'
    if kind == NTAG:
        return 'CC', 'TCAATG'
    return prefix, suffix
def _add_tags_to_pcrproducts(pcr_products, prefix, suffix, kind):
    '''It adds the universal and vector tags to the pcr products.

    Every product is flanked by OLIGO_UNIVERSAL and its reverse
    complement. The first product also gets DOMEST_VECTOR_PREFIX + prefix
    in front and the last one suffix + the reverse complement of
    DOMEST_VECTOR_SUFFIX behind. The prefix and suffix used are the ones
    chosen by _guess_prefix_suffix_tag for ``kind``.

    Returns ``(tagged_products, prefix, suffix)``; the tagged products are
    upper-cased.
    '''
    prefix, suffix = _guess_prefix_suffix_tag(kind, prefix, suffix)
    last_index = len(pcr_products) - 1
    tagged_products = []
    for index, product in enumerate(pcr_products):
        head = OLIGO_UNIVERSAL
        if index == 0:
            head += DOMEST_VECTOR_PREFIX + prefix
        tagged = head + product
        if index == last_index:
            tagged += (suffix +
                       str(Seq(DOMEST_VECTOR_SUFFIX).reverse_complement()))
        tagged += str(Seq(OLIGO_UNIVERSAL).reverse_complement())
        tagged_products.append(tagged.upper())
    return tagged_products, prefix, suffix
def _add_tags_to_oligos(oligos, prefix, suffix, kind):
    '''It adds the universal and vector tags to the oligo pairs.

    Both oligos of every pair start with OLIGO_UNIVERSAL. The forward
    oligo of the first pair also gets DOMEST_VECTOR_PREFIX + prefix and the
    reverse oligo of the last pair DOMEST_VECTOR_SUFFIX + the reverse
    complement of the suffix. The prefix and suffix used are the ones
    chosen by _guess_prefix_suffix_tag for ``kind``.

    Returns a list of upper-cased (forward, reverse) tuples.
    '''
    prefix, suffix = _guess_prefix_suffix_tag(kind, prefix, suffix)
    rev_suffix = str(Seq(suffix).reverse_complement())
    last_index = len(oligos) - 1
    tagged_pairs = []
    for index, pair in enumerate(oligos):
        forward = OLIGO_UNIVERSAL
        if index == 0:
            forward += DOMEST_VECTOR_PREFIX + prefix
        forward += pair[0]

        reverse = OLIGO_UNIVERSAL
        if index == last_index:
            reverse += DOMEST_VECTOR_SUFFIX + rev_suffix
        reverse += pair[1]

        tagged_pairs.append((forward.upper(), reverse.upper()))
    return tagged_pairs
def _get_oligo(seq, min_melting_temp, min_length=None):
    '''Return the shortest oligo that reaches the melting temperature.

    The oligo is a prefix of ``seq``. Prefixes are tried from ``min_length``
    nucleotides (never less than DOMESTICATION_MIN_OLIGO_LENGTH) up to the
    whole sequence, and the first one whose annealing temperature is at
    least ``min_melting_temp`` is returned as a string.

    If no prefix reaches the temperature the whole sequence is returned.
    A sequence shorter than the minimum length is returned whole too,
    without checking its temperature.
    '''
    if not min_length or min_length < DOMESTICATION_MIN_OLIGO_LENGTH:
        min_length = DOMESTICATION_MIN_OLIGO_LENGTH
    # Start from a real prefix: with an empty list as the initial value a
    # sequence not longer than min_length was returned as the string '[]'.
    oligo = seq[:min_length]
    # len(seq) + 1 so the complete sequence is tried as the last candidate
    for index in range(min_length, len(seq) + 1):
        oligo = seq[:index]
        if _calculate_annealing_temp(oligo) >= min_melting_temp:
            break
    return str(oligo)
def _calculate_annealing_temp(seq):
# from http://www.basic.northwestern.edu/biotools/oligocalc.html
# Tm (C)= 64.9 +41*(yG+zC-16.4)/(wA+xT+yG+zC)
seq = seq.upper()
len_seq = len(seq)
return 64.9 + 41 * (seq.count('G') + seq.count('C') - 16.4) / len_seq
def _remove_rec_sites(seq, enzymes=None):
    '''It modifies all rec sites in the sequence to be able to use it with
    the goldenbraid pipeline.

    ``enzymes`` defaults to MANDATORY_DOMEST_ENZYMES when it is None.

    Returns ``(new_seq, rec_site_pairs, fragments)``:
      * new_seq: the sequence with every rec site replaced.
      * rec_site_pairs: one {'original': ..., 'modified': ...} dict per
        rec site found, in order.
      * fragments: the pieces of the sequence found between the rec sites.

    Raises ValueError if the upper case nucleotides of the new sequence do
    not translate to the same peptide as the original ones, or if a rec
    site is still found in the new sequence.
    '''
    if enzymes is None:
        enzymes = MANDATORY_DOMEST_ENZYMES
    site_patterns = get_ret_sites(enzymes)
    # regex with the sites to domesticate
    rec_sites_regex = re.compile('(' + '|'.join(site_patterns) + ')',
                                 flags=re.IGNORECASE)

    # the split also gives back the text matched by the regex group, so
    # the pieces are sorted in rec sites and plain fragments
    fragments = []
    rec_sites_in_seq = []
    for piece in rec_sites_regex.split(str(seq)):
        if rec_sites_regex.match(piece):
            rec_sites_in_seq.append(piece)
        else:
            fragments.append(piece)

    new_seq = Seq('', alphabet=generic_dna)
    # we can not convert a rec site in another rec site
    upstream_patch = ''  # it is only used to know the frame
    rec_site_pairs = []
    for fragment, site in izip_longest(fragments, rec_sites_in_seq):
        new_seq += fragment
        if site is None:
            continue
        upstream_patch += fragment + site
        fixed_site = _domesticate_rec_site(site, upstream_patch,
                                           rec_sites_regex)
        rec_site_pairs.append({'original': site, 'modified': fixed_site})
        new_seq += fixed_site

    coding_seq = Seq(_get_upper_nucls(seq))
    new_coding_seq = Seq(_get_upper_nucls(new_seq))
    same_peptide = (str(coding_seq.translate()) ==
                    str(new_coding_seq.translate()))
    if not same_peptide:
        raise ValueError('The generated sequence does not produce the same '
                         'peptide')
    if rec_sites_regex.search(str(new_seq)):
        raise ValueError('Not all rec_sites modified')
    return new_seq, rec_site_pairs, fragments
def _get_upper_nucls(seq):
return ''.join([nucl for nucl in seq if nucl.isupper()])
def change_nucl_in_intron_rec_site(rec_site, rec_sites_regex):
    '''Disable a rec site changing one of its lower case nucleotides.

    The lower case positions are tried from left to right and, for each
    of them, the replacements 'a', 't', 'c' and 'g' in that order. The
    first site that rec_sites_regex does not match is returned. It
    returns None if there is no such site.
    '''
    lower_positions = (pos for pos, nucl in enumerate(rec_site)
                       if nucl.islower())
    for pos in lower_positions:
        head, tail = rec_site[:pos], rec_site[pos + 1:]
        for replacement in 'atcg':
            candidate = head + replacement + tail
            if not rec_sites_regex.match(candidate):
                return candidate
    return None
def _domesticate_rec_site(rec_site, patch, rec_sites_regex):
    '''It converts a rec site in a disabled rec site changing one nucleotide.

    A rec site with lower case letters is handled by
    change_nucl_in_intron_rec_site. Otherwise the third base of the last
    complete codon of ``patch`` (the sequence up to and including the rec
    site; only its upper case letters count for the frame) is replaced by
    the third base of a codon for the same amino acid.

    Raises ValueError if that codon has no alternative that differs only
    in the third base.
    '''
    if any(letter.islower() for letter in rec_site):
        return change_nucl_in_intron_rec_site(rec_site, rec_sites_regex)

    codon_table = get_codontable()
    coding_patch = _get_upper_nucls(patch)

    # For every possible number of nucleotides left over after the last
    # complete codon: the index (from the end of the rec site) of the third
    # base of that codon and the slice of the patch that holds the codon.
    frame_rules = {0: (-1, slice(-3, None)),
                   1: (-2, slice(-4, -1)),
                   2: (-3, slice(-5, -2))}
    baseindex_to_change, codon_slice = frame_rules[len(coding_patch) % 3]
    lastcodon = coding_patch[codon_slice]

    # if lastcodon is Methionine, change the previous codon,
    # since Met does not have alternative codon
    if lastcodon == 'ATG' and rec_site == 'GCGATG':
        baseindex_to_change = -4
        lastcodon = coding_patch[-6:-3]

    # alternative codons: same amino acid and same two first bases
    wanted_aa = codon_table.get(lastcodon)
    alt_codons = [codon for codon in codon_table
                  if codon != lastcodon and
                  codon_table.get(codon) == wanted_aa and
                  codon[0] == lastcodon[0] and
                  codon[1] == lastcodon[1]]
    if not alt_codons:
        raise ValueError('No domestication possible for ORF site ' + rec_site)

    # the first alternative codon found is the one used
    newsite = rec_site[0:baseindex_to_change] + alt_codons[0][2]
    if baseindex_to_change < -1:
        newsite += rec_site[(baseindex_to_change + 1):]
    return newsite
# CRISPR target domestication
def domestication_crispr(seq, category=None, prefix=None, suffix=None):
    '''Domesticate a CRISPR target sequence.

    seq: the target sequence, at least 20 nucleotides long and exactly 20
        when a category is given.
    category: optional target category. TARGET_DICOT requires the sequence
        to start with G and TARGET_MONOCOT with A. With a category only the
        first three nucleotides of the prefix are used.
    prefix, suffix: the sequences added before and after the target.

    Returns a SeqRecord with prefix + seq + suffix, named after CRYSPER_SEQ
    and the next value of its Count.

    Raises ValueError if the length or first nucleotide requirements are
    not met or if has_rec_sites finds restriction sites in the sequence.
    '''
    if len(seq) < 20:
        raise ValueError('Seq length must be at least 20')
    if len(seq) != 20 and category:
        msg = 'To domesticate with the given target type, the CRISPR target '
        msg += 'size must be 20'
        raise ValueError(msg)
    if category == TARGET_DICOT and str(seq[0]).upper() != 'G':
        raise ValueError('First nucleotide must be G for target dicot category')
    if category == TARGET_MONOCOT and str(seq[0]).upper() != 'A':
        # the message used to ask for a G although the check requires an A
        raise ValueError('First nucleotide must be A for target monocot category')
    if has_rec_sites(seq):
        msg = 'This secuence can not be domesticated. It has internal restriction sites'
        raise ValueError(msg)
    if category:
        prefix = prefix[:3]
    # the record name is built from the value handed out by the Count model
    try:
        count = Count.objects.get(name=CRYSPER_SEQ)
    except Count.DoesNotExist:
        count = Count.objects.create(name=CRYSPER_SEQ, value=1)
    next_value = count.next
    prepared_seq = Seq(prefix + seq + suffix)
    seq_name = CRYSPER_SEQ + '_' + next_value
    new_seq_record = SeqRecord(prepared_seq, name=seq_name, id=seq_name)
    return new_seq_record