/
seqdistance_old.py
703 lines (599 loc) · 26.1 KB
/
seqdistance_old.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
from functools import *
import itertools
import operator
from Bio import SeqIO, pairwise2
from Bio.SubsMat.MatrixInfo import blosum90, ident, blosum62
from copy import deepcopy
import sys
import numpy as np
import numba as nb
import re
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA, KernelPCA
from sklearn import cluster
from sklearn.manifold import Isomap
import tsne
import pytsne
# Public API of this module (used by `from seqdistance_old import *`).
__all__ = ['BADAA',
           'AALPHABET',
           'AA2CODE',
           'CODE2AA',
           'isvalidpeptide',
           'removeBadAA',
           'hamming_distance',
           'trunc_hamming',
           'dichot_hamming',
           'seq2vec',
           'nanGapScores',
           'nanZeroGapScores',
           'binGapScores',
           'blosum90GapScores',
           'binarySubst',
           'addGapScores',
           'seq_similarity',
           'seq_distance',
           'seq_similarity_old',
           'unalign_similarity',
           '_test_seq_similarity',
           'calcDistanceMatrix',
           'calcDistanceRectangle',
           'blosum90',
           'ident',
           'blosum62',
           'embedDistanceMatrix']
# Characters treated as "bad" amino acids: gap, stop and ambiguity codes.
BADAA = '-*BX#Z'
# Full alphabet including ambiguity codes (B, X, Z) and the gap character '-'.
FULL_AALPHABET = 'ABCDEFGHIKLMNPQRSTVWXYZ-'
# The 20 standard amino acids only.
AALPHABET = 'ACDEFGHIKLMNPQRSTVWY'
# Map: AA character -> integer code (its index in FULL_AALPHABET).
AA2CODE = {aa:i for i, aa in enumerate(FULL_AALPHABET)}
# NOTE(review): '-' is already at index 23 of FULL_AALPHABET, so this
# update is redundant; kept for safety/backward compatibility.
AA2CODE.update({'-':23})
# Inverse map: integer code -> AA character.
CODE2AA = {i:aa for i, aa in enumerate(FULL_AALPHABET)}
CODE2AA.update({23:'-'})
def subst2mat(subst, alphabet = FULL_AALPHABET):
    """Convert a substitution dictionary (e.g. one of the Bio matrices,
    keyed by (aa1, aa2) tuples) into a square numpy similarity matrix
    indexed by position in `alphabet`.

    Pairs that are absent from `subst` are left as NaN."""
    n = len(alphabet)
    mat = np.full((n, n), np.nan, dtype=np.float64)
    for (aa1, aa2), score in subst.items():
        mat[alphabet.index(aa1), alphabet.index(aa2)] = score
    return mat
"""Many different ways of handling gaps. Remember that these are SIMILARITY scores"""
nanGapScores={('-', '-'):np.nan,
('-', 'X'):np.nan,
('X', '-'):np.nan}
nanZeroGapScores={('-', '-'):np.nan,
('-', 'X'):0,
('X', '-'):0}
"""Default for addGapScores()"""
binGapScores={('-', '-'):1,
('-', 'X'):0,
('X', '-'):0}
"""Arbitrary/reasonable values (extremes for blosum90 I think)"""
blosum90GapScores={('-', '-'):5,
('-', 'X'):-11,
('X', '-'):-11}
binarySubst = {(aa1, aa2):np.float64(aa1==aa2) for aa1, aa2 in itertools.product(FULL_AALPHABET, FULL_AALPHABET)}
identMat = subst2mat(ident)
blosum90Mat = subst2mat(blosum90)
blosum62Mat = subst2mat(blosum62)
binaryMat = subst2mat(binarySubst)
def isvalidpeptide(mer, badaa=None):
    """Return True if `mer` contains none of the characters in `badaa`
    (default: module-level BADAA, typically -*BX#Z).
    A None mer is considered invalid and returns False."""
    if badaa is None:
        badaa = BADAA
    if mer is None:
        return False
    return not re.search('[%s]' % badaa, mer)
def removeBadAA(mer, badaa=None):
    """Return `mer` with every character in `badaa` removed
    (default: module-level BADAA, typically -*BX#Z).
    A None mer passes through unchanged."""
    if badaa is None:
        badaa = BADAA
    if mer is None:
        return mer
    return re.sub('[%s]' % badaa, '', mer)
def hamming_distance(str1, str2, noConvert = False, **kwargs):
    """Hamming distance between str1 and str2, computed over the length
    of the shorter input.

    With noConvert=True a pure-python comparison is used; otherwise
    strings are converted to byte arrays and the numba kernel is called.
    Extra **kwargs are accepted so this can be plugged in wherever a
    seq_distance()-style metric is expected."""
    if noConvert:
        # Count positions where the characters differ (zip stops at the shorter).
        return np.sum([a != b for a, b in zip(str1, str2)])
    if isinstance(str1, str):
        str1 = string2byte(str1)
    if isinstance(str2, str):
        str2 = string2byte(str2)
    return nb_hamming_distance(str1, str2)
def aamismatch_distance(seq1, seq2, **kwargs):
    """Mismatch-count distance between two equal-length sequences,
    computed as an unnormalized distance under the binary substitution
    matrix. Accepts strings or pre-encoded int8 vectors; extra **kwargs
    are ignored so this can serve as a drop-in metric."""
    vec1 = seq2vec(seq1) if isinstance(seq1, str) else seq1
    vec2 = seq2vec(seq2) if isinstance(seq2, str) else seq2
    return nb_seq_similarity(vec1, vec2, substMat = binaryMat, normed = False, asDistance = True)
def string2byte(s):
    """Convert a string (or string ndarray) into a numpy byte array,
    since numba kernels cannot operate on python strings directly.

    Each fixed-width unicode element is expanded into its raw bytes
    along a new trailing axis."""
    if isinstance(s, str):
        s = np.array(s)
    if s.dtype is np.dtype('byte'):
        # Already a byte array: nothing to do.
        return s
    newshape = list(s.shape) + [s.dtype.itemsize]
    return s.ravel().view(dtype='byte').reshape(newshape)
def seq2vec(seq):
    """Encode an AA sequence as an int8 numpy vector using AA2CODE,
    enabling fast numeric comparison."""
    return np.array([AA2CODE[aa] for aa in seq], dtype = np.int8)
def seqs2mat(seqs):
    """Encode a collection of equal-length AA sequences as an int8
    numpy matrix of shape [nSeqs, seqLength] using AA2CODE.

    Raises AssertionError if any sequence differs in length from the first."""
    L1 = len(seqs[0])
    mat = np.zeros((len(seqs), L1), dtype = np.int8)
    for si, s in enumerate(seqs):
        assert L1 == len(s), "All sequences must have the same length: L1 = %d, but L%d = %d" % (L1, si, len(s))
        mat[si, :] = [AA2CODE[aa] for aa in s]
    return mat
@nb.jit(nb.int8(nb.char[:], nb.char[:]), nopython = True)
def nb_hamming_distance(str1, str2):
    # Numba kernel: count positions where the two byte arrays differ
    # (iterates over the length of the shorter array).
    # NOTE(review): the declared return type is int8, which overflows for
    # more than 127 mismatches -- confirm inputs are short enough.
    tot = 0
    for s1, s2 in zip(str1, str2):
        if s1 != s2:
            tot += 1
    return tot
def trunc_hamming(seq1, seq2, maxDist=2, **kwargs):
    """Truncated hamming distance: the plain hamming distance between
    seq1 and seq2, capped at maxDist."""
    return min(hamming_distance(seq1, seq2), maxDist)
def dichot_hamming(seq1, seq2, mmTolerance=1, **kwargs):
    """Dichotomized hamming distance: 0 when the hamming distance is
    within mmTolerance, otherwise 1."""
    return 0 if hamming_distance(seq1, seq2) <= mmTolerance else 1
def addGapScores(subst, gapScores = None, minScorePenalty = False, returnMat = False):
    """Return a copy of the substitution dict `subst` augmented with
    similarity scores for gaps: ('-', aa) / (aa, '-') for every AA seen
    in subst, plus ('-', '-').

    If minScorePenalty is True, gap-vs-AA gets the minimum score found in
    subst and gap-gap gets 1. Otherwise, if gapScores is None, defaults
    are chosen for the known matrices (binGapScores for binarySubst,
    blosum90GapScores for blosum90); any other matrix without explicit
    gapScores raises an Exception.

    If returnMat is True the augmented dict is converted to an ndarray
    via subst2mat. (Could be done once per set of sequences for speed.)"""
    if minScorePenalty:
        worst = np.min(list(subst.values()))
        gapScores = {('-', '-') : 1,
                     ('-', 'X') : worst,
                     ('X', '-') : worst}
    elif gapScores is None:
        if subst is binarySubst:
            print('Using default binGapScores for binarySubst')
            gapScores = binGapScores
        elif subst is blosum90:
            print('Using default blosum90 gap scores')
            gapScores = blosum90GapScores
        else:
            raise Exception('Cannot determine which gap scores to use!')
    su = deepcopy(subst)
    # Every first-element AA appearing in subst gets explicit gap scores.
    for aa in {k[0] for k in subst.keys()}:
        su[('-', aa)] = gapScores[('-', 'X')]
        su[(aa, '-')] = gapScores[('X', '-')]
    su[('-', '-')] = gapScores[('-', '-')]
    if returnMat:
        return subst2mat(su)
    return su
#@nb.jit(nb.float64(nb.int8[:],nb.int8[:],nb.float64[:,:],nb.boolean,nb.boolean), nopython = True)
@nb.jit(nopython = True)
def nb_seq_similarity(seq1, seq2, substMat, normed, asDistance):
    """Computes sequence similarity based on the substitution matrix.

    seq1, seq2 : equal-length int vectors of codes indexing into substMat.
    substMat : 2D float64 substitution matrix; NaN entries are skipped.
    normed : normalize using the self-similarities of seq1 and seq2.
    asDistance : return siteN - similarity (per-site when normed).

    Raises IndexError when the sequences differ in length."""
    if seq1.shape[0] != seq2.shape[0]:
        raise IndexError
    if normed or asDistance:
        # Accumulate cross-similarity plus both self-similarities so the
        # score can be normalized as 2*sim12 / (mean(sim11) + mean(sim22)).
        sim12 = 0.
        siteN = 0.
        sim11 = 0.
        sim22 = 0.
        for i in range(seq1.shape[0]):
            cur12 = substMat[seq1[i], seq2[i]]
            cur11 = substMat[seq1[i], seq1[i]]
            cur22 = substMat[seq2[i], seq2[i]]
            if not np.isnan(cur12):
                sim12 += cur12
                siteN += 1.
            if not np.isnan(cur11):
                sim11 += cur11
            if not np.isnan(cur22):
                sim22 += cur22
        # NOTE(review): if every cross-site score is NaN, siteN == 0 and this
        # divides by zero -- confirm inputs always have at least one scored site.
        sim12 = 2*sim12/((sim11/siteN) + (sim22/siteN))
    else:
        # Unnormalized: sum the non-NaN sitewise scores.
        sim12 = 0.
        siteN = 0.
        for i in range(seq1.shape[0]):
            if not np.isnan(substMat[seq1[i], seq2[i]]):
                sim12 += substMat[seq1[i], seq2[i]]
                siteN += 1.
    if asDistance:
        # Distance = siteN - similarity; divided by siteN (per-site) when normed.
        if normed:
            sim12 = (siteN - sim12)/siteN
        else:
            sim12 = siteN - sim12
    return sim12
def np_seq_similarity(seq1, seq2, substMat, normed, asDistance):
    """Computes sequence similarity based on the substitution matrix.

    Parameters
    ----------
    seq1, seq2 : np.ndarray of int
        Equal-length vectors of codes indexing into substMat.
    substMat : np.ndarray of float64
        Substitution matrix; NaN entries are ignored (unscored sites).
    normed : bool
        If True, normalize by the mean self-similarity of each sequence:
        2*sim12 / (mean(sim11) + mean(sim22)).
    asDistance : bool
        If True, return siteN - similarity (per-site when normed).

    Returns
    -------
    float

    Raises
    ------
    IndexError
        If the sequences differ in length."""
    if seq1.shape[0] != seq2.shape[0]:
        raise IndexError("Sequences must be the same length (%d != %d)." % (seq1.shape[0], seq2.shape[0]))
    """Similarity between seq1 and seq2 using the substitution matrix subst"""
    sim12 = substMat[seq1, seq2]
    if normed or asDistance:
        siteN = (~np.isnan(sim12)).sum()
        sim11 = np.nansum(substMat[seq1, seq1])/siteN
        # BUG FIX: sim22 previously re-used seq1 (substMat[seq1, seq1]),
        # so the normalization was wrong whenever seq1 and seq2 had
        # different self-similarities.
        sim22 = np.nansum(substMat[seq2, seq2])/siteN
        tot12 = np.nansum(2*sim12)/(sim11+sim22)
    else:
        tot12 = np.nansum(sim12)
    if asDistance:
        """Distance between seq1 and seq2 using the substitution matrix subst
        because seq_similarity returns a total similarity with max of siteN (not per site), we use
        d = siteN - sim
        which is a total normed distance, not a per site distance"""
        if normed:
            tot12 = (siteN - tot12)/siteN
        else:
            tot12 = siteN - tot12
    return tot12
def seq_similarity(seq1, seq2, subst = None, normed = True, asDistance = False):
    """Similarity between two equal-length sequences based on a
    substitution matrix; raises if the lengths differ.

    Returns a nansum of sitewise scores (NaN sites, which may depend on
    gap handling, are ignored). With normed=True the result is
    nansum(2*sim12 / (nanmean(sim11) + nanmean(sim22))), which lies in
    [0, 1] for binary and blosum matrices; with normed=False it is the
    raw total similarity in [0, siteN].

    CAUTION: for highly divergent sequences the normalization does not
    reach [0, 1] because of excess negative scores; consider shifting
    the matrix so its min() is 0 first (but not per comparison).

    subst may be a dict (converted via subst2mat) or a precomputed
    ndarray; seq1/seq2 may be strings (converted via seq2vec) or
    pre-encoded vectors. With asDistance=True a distance is returned."""
    if subst is None:
        print('Using default binarySubst matrix with binGaps for seq_similarity')
        subst = addGapScores(binarySubst, binGapScores)
    substMat = subst2mat(subst) if isinstance(subst, dict) else subst
    vec1 = seq2vec(seq1) if isinstance(seq1, str) else seq1
    vec2 = seq2vec(seq2) if isinstance(seq2, str) else seq2
    return np_seq_similarity(vec1, vec2, substMat = substMat, normed = normed, asDistance = asDistance)
def seq_similarity_old(seq1, seq2, subst=None, normed=True):
    """Similarity between two equal-length sequences based on a
    substitution dict; raises AssertionError if the lengths differ.

    Returns a nansum of sitewise scores (NaN sites ignored). With
    normed=True the result is
    nansum(2*sim12 / (nanmean(sim11) + nanmean(sim22))), in [0, 1] for
    binary/blosum matrices; with normed=False it is the raw total.

    CAUTION: for highly divergent sequences the normalization does not
    reach [0, 1] because of excess negative scores.

    When subst is exactly the module's binarySubst (and there are no
    gaps) the computation reduces to a fast hamming comparison."""
    assert len(seq1) == len(seq2), "len of seq1 (%d) and seq2 (%d) are different" % (len(seq1), len(seq2))
    if subst is binarySubst:
        # Hamming shortcut: similarity is simply the number of matches.
        matches = len(seq1) - hamming_distance(seq1, seq2)
        return matches / len(seq1) if normed else matches
    if subst is None:
        print('Using default binarySubst matrix with binGaps for seq_similarity')
        subst = addGapScores(binarySubst, binGapScores)
    # Sitewise scores; fall back to the reversed pair when (a, b) is absent.
    sitewise = np.array([subst.get((a, b), subst.get((b, a))) for a, b in zip(seq1, seq2)])
    if not normed:
        return np.nansum(sitewise)
    siteN = np.sum(~np.isnan(sitewise))
    sim11 = seq_similarity_old(seq1, seq1, subst=subst, normed=False)/siteN
    sim22 = seq_similarity_old(seq2, seq2, subst=subst, normed=False)/siteN
    return np.nansum(2*sitewise/(sim11+sim22))
def seq_distance(seq1, seq2, subst = None, normed = True):
    """Distance between two equal-length sequences (raises if the
    lengths differ), computed as the complement of seq_similarity.

    The result is always "normed" in one of two senses: per-site when
    normed=True (range [0, 1]), or total when normed=False (range
    [0, siteN], where siteN ignores NaN-scored sites such as gaps)."""
    d = seq_similarity(seq1, seq2, subst=subst, normed=normed, asDistance=True)
    return d
def unalign_similarity(seq1, seq2, subst=None):
    """Globally align seq1 and seq2 with Bio.pairwise2 (scores taken
    from the substitution dict, default blosum90) and return the score
    of the best alignment."""
    matrix = blosum90 if subst is None else subst
    alignments = pairwise2.align.globaldx(seq1, seq2, matrix)
    return alignments[0][2]
def _test_seq_similarity(subst=None, normed=True):
    """Smoke-test seq_similarity_old() and seq_distance() over a small
    panel of sequences (with and without gaps), printing the similarity
    and distance for every pair.

    If subst is None, a default list of gap-augmented matrices is
    exercised; otherwise the supplied subst is used for all pairs."""
    def test_one(s, sname, n, seq1, seq2):
        # Print each pair plus its similarity/distance; exceptions are
        # reported rather than raised so the whole panel always runs.
        print(seq1)
        print(seq2)
        try:
            sim = seq_similarity_old(seq1, seq2, subst=s, normed=n)
            print('Similarity: %f' % sim)
        # BUG FIX: bare `except:` also trapped KeyboardInterrupt/SystemExit.
        except Exception:
            print('Similarity: %s [%s]' % (sys.exc_info()[0], sys.exc_info()[1]))
        #dist = seq_distance(seq1,seq2,subst=s)
        try:
            dist = seq_distance(seq1, seq2, subst=s)
            print('Distance: %f' % dist)
        except Exception:
            print('Distance: %s [%s]' % (sys.exc_info()[0], sys.exc_info()[1]))
        print()
    seqs = ['AAAA',
            'AAAA',
            'KKKK',
            'AAKA',
            '-AAA',
            '-A-A']
    if subst is None:
        subst = [addGapScores(binarySubst, binGapScores),
                 addGapScores(binarySubst, nanZeroGapScores),
                 addGapScores(blosum90, blosum90GapScores),
                 addGapScores(blosum90, nanGapScores)]
        names = ['addGapScores(binarySubst,binGapScores)',
                 'addGapScores(binarySubst,nanZeroGapScores)',
                 'addGapScores(blosum90,blosum90GapScores)',
                 'addGapScores(blosum90,nanGapScores)']
        for s, sname in zip(subst, names):
            print('Using %s normed = %s' % (sname, normed))
            for seq1, seq2 in itertools.combinations(seqs, 2):
                test_one(s, sname, normed, seq1, seq2)
    else:
        for seq1, seq2 in itertools.combinations(seqs, 2):
            test_one(subst, 'supplied subst', normed, seq1, seq2)
def calcDistanceMatrix(seqs, normalize=False, symetric=True, metric=None, **kwargs):
    """Square pairwise distance matrix over the sequences in seqs
    (rows and columns are identically ordered; duplicate sequences are
    only computed once internally).

    Parameters
    ----------
    seqs : list/iterator
        Genetic sequences to compare.
    normalize : bool
        If True (default: False), subtracts dist.min() to eliminate
        negative distances (could be improved/expanded).
    symetric : bool
        If True (default), assumes dist(A,B) == dist(B,A) and roughly
        halves the computation.
    metric : callable(seq1, seq2, **kwargs)
        Called for each pairwise distance.
    kwargs : additional keyword arguments
        Passed through to each metric() call (e.g. subst, normed).

    Returns
    -------
    dist : ndarray of shape [len(seqs), len(seqs)]
        All pairwise distances for seqs."""
    return calcDistanceRectangle_old(seqs, seqs,
                                     normalize=normalize,
                                     symetric=symetric,
                                     metric=metric,
                                     **kwargs)
def calcDistanceRectangle_old(row_seqs, col_seqs, normalize=False, symetric=False, metric=None, convertToNP=False, **kwargs):
    """Rectangular pairwise distance matrix between the sequences in
    row_seqs (rows) and col_seqs (columns). Duplicate sequences are
    collapsed before computing and expanded again at the end.

    Parameters
    ----------
    row_seqs : list/iterator
        Genetic sequences to compare.
    col_seqs : list/iterator
        Genetic sequences to compare.
    normalize : bool
        If True (default: False), subtracts dist.min() to eliminate
        negative distances (could be improved/expanded).
    symetric : bool
        If True (default: False), assumes dist(A,B) == dist(B,A) and
        only computes the lower triangle plus the diagonal.
    metric : callable(seq1, seq2, **kwargs)
        Called for each pairwise distance; defaults to seq_distance.
    convertToNP : bool (default: False)
        If True, strings are converted to int vectors for speed, and
        metric must accept arrays rather than strings.
    kwargs : additional keyword arguments
        Passed through to each metric() call ('normed' defaults to False).

    Returns
    -------
    dist : ndarray of shape [len(row_seqs), len(col_seqs)]
        All pairwise distances."""
    kwargs.setdefault('normed', False)
    if metric is None:
        metric = seq_distance
    # Compute on unique sequences only; de-uniquify with the inverse indices later.
    row_uSeqs, row_uniqi, row_inv_uniqi = np.unique(row_seqs, return_index=True, return_inverse=True)
    col_uSeqs, col_uniqi, col_inv_uniqi = np.unique(col_seqs, return_index=True, return_inverse=True)
    if convertToNP:
        R = [seq2vec(s) for s in row_uSeqs]
        C = [seq2vec(s) for s in col_uSeqs]
    else:
        R, C = row_uSeqs, col_uSeqs
    nR, nC = len(row_uSeqs), len(col_uSeqs)
    dist = np.zeros((nR, nC))
    for i in range(nR):
        for j in range(nC):
            if symetric:
                if j > i:
                    continue  # mirrored from the lower triangle below
                d = metric(R[i], C[j], **kwargs)
                dist[i, j] = d
                if j < i:
                    dist[j, i] = d
            else:
                dist[i, j] = metric(R[i], C[j], **kwargs)
    if normalize:
        dist = dist - dist.min()
    # De-uniquify so dist has shape [len(row_seqs), len(col_seqs)].
    return dist[row_inv_uniqi, :][:, col_inv_uniqi]
def calcDistanceRectangle(row_seqs, col_seqs, subst=None, nb_metric=None, normalize=False, symetric=False):
    """Rectangular pairwise distance matrix between the sequences in
    row_seqs (rows) and col_seqs (columns). Duplicate sequences are
    collapsed before computing and expanded again at the end.

    BUG FIX: the previous body referenced undefined names (kwargs,
    metric, convertToNP, zeros) copied from calcDistanceRectangle_old,
    so every call raised NameError. It now computes distances with
    seq_distance using the supplied subst.

    TODO:
    (1) Wrap this function around distRect()/nb_metric for jit'd speed.
    (2) Define a coverage nb_metric.
    (3) Back-up plan for when the numba import fails.

    Parameters
    ----------
    row_seqs : list/iterator
        Genetic sequences to compare.
    col_seqs : list/iterator
        Genetic sequences to compare.
    subst : dict or ndarray
        Similarity matrix for use by the metric (subst dict or substMat).
    nb_metric : numba jit'd function
        Currently unused; reserved for dispatch to distRect() (see TODO).
    normalize : bool
        If True (default: False), subtracts dist.min() to eliminate
        negative distances (could be improved/expanded).
    symetric : bool
        If True (default: False), assumes dist(A,B) == dist(B,A) and
        speeds up computation.

    Returns
    -------
    dist : ndarray of shape [len(row_seqs), len(col_seqs)]
        All pairwise distances."""
    kwargs = {'normed': False}
    if subst is not None:
        kwargs['subst'] = subst
    metric = seq_distance
    """Only compute distances on unique sequences. De-uniquify with inv_uniqi later"""
    row_uSeqs, row_uniqi, row_inv_uniqi = np.unique(row_seqs, return_index=True, return_inverse=True)
    col_uSeqs, col_uniqi, col_inv_uniqi = np.unique(col_seqs, return_index=True, return_inverse=True)
    dist = np.zeros((len(row_uSeqs), len(col_uSeqs)))
    for i, j in itertools.product(range(len(row_uSeqs)), range(len(col_uSeqs))):
        if not symetric:
            """If not assumed symetric, compute all distances"""
            dist[i, j] = metric(row_uSeqs[i], col_uSeqs[j], **kwargs)
        elif j < i:
            tmp = metric(row_uSeqs[i], col_uSeqs[j], **kwargs)
            dist[i, j] = tmp
            dist[j, i] = tmp
        elif j == i:
            dist[i, j] = metric(row_uSeqs[i], col_uSeqs[j], **kwargs)
    if normalize:
        dist = dist - dist.min()
    """De-uniquify such that dist is now shape [len(row_seqs), len(col_seqs)]"""
    dist = dist[row_inv_uniqi, :][:, col_inv_uniqi]
    return dist
def distRect_factory(nb_metric):
    """Given a numba jit'd sitewise metric (with the signature of
    nb_seq_similarity), return a jit'd function that fills a pairwise
    distance matrix for all row/col sequence vectors.

    BUG FIX: the inner function previously hard-coded nb_seq_similarity,
    silently ignoring the nb_metric argument; it now calls nb_metric."""
    @nb.jit(nb.boolean(nb.float64[:,:], nb.int8[:,:], nb.int8[:,:], nb.float64[:,:], nb.boolean), nopython=True)
    def nb_distRect(pwdist, rows, cols, substMat, symetric):
        n = rows.shape[0]
        m = cols.shape[0]
        for i in range(n):
            for j in range(m):
                if not symetric:
                    pwdist[i, j] = nb_metric(rows[i,:], cols[j,:], substMat=substMat, normed=False, asDistance=True)
                else:
                    # Only compute the lower triangle + diagonal, then mirror.
                    if j <= i:
                        pwdist[i, j] = nb_metric(rows[i,:], cols[j,:], substMat=substMat, normed=False, asDistance=True)
                        pwdist[j, i] = pwdist[i, j]
        return True
    return nb_distRect
def distRect(row_vecs, col_vecs, substMat, nb_metric, normalize=False, symetric=False):
    """Pairwise distance matrix between pre-encoded sequence vectors
    using a jit'd metric built by distRect_factory.

    Conversions belong in a wrapper with the uniquing logic, e.g.:
        substMat = subst2mat(addGapScores(binarySubst, binGapScores)) if subst is None else subst2mat(subst)
        nb_metric = nb_seq_similarity if nb_metric is None else nb_metric
        row_vecs, col_vecs = seqs2mat(row_seqs), seqs2mat(col_seqs)

    If normalize is True, dist.min() is subtracted from the result."""
    jit_rect = distRect_factory(nb_metric)
    pwdist = np.zeros((row_vecs.shape[0], col_vecs.shape[0]), dtype=np.float64)
    assert jit_rect(pwdist, row_vecs, col_vecs, substMat, symetric)
    if normalize:
        pwdist = pwdist - pwdist.min()
    return pwdist
#@jit()
def coverageDistance(epitope, peptide, mmTolerance = 1, **kwargs):
    """Determines whether peptide covers epitope; handles epitopes and
    peptides of different lengths.

    To be a consistent distance matrix:
        covered = 0
        not-covered = 1
    If epitope is longer than peptide it is not covered. Otherwise the
    epitope is slid along the peptide and coverage is determined by the
    best (fewest-mismatch) window versus mmTolerance.

    BUG FIX: the window-minimum previously called the undefined name
    `array` (NameError on every call); it now uses the builtin min().

    Can accomodate strings or np.arrays (but not a mix).

    Parameters
    ----------
    epitope : str or np.array
    peptide : str or np.array
    mmTolerance : int
        Number of mismatches tolerated;
        if dist <= mmTolerance then it is covered.

    Returns
    -------
    covered : int
        Covered (0) or not-covered (1)"""
    tEpitope, tPeptide = type(epitope), type(peptide)
    assert tEpitope == tPeptide
    LEpitope, LPeptide = len(epitope), len(peptide)
    if LEpitope > LPeptide:
        return 1
    if isinstance(epitope, str):
        min_dist = min(sum(a != b for a, b in zip(epitope, peptide[starti:starti + LEpitope]))
                       for starti in range(LPeptide - LEpitope + 1))
    else:
        min_dist = min(int((epitope != peptide[starti:starti + LEpitope]).sum())
                       for starti in range(LPeptide - LEpitope + 1))
    return 0 if min_dist <= mmTolerance else 1
def embedDistanceMatrix(dist, method='tsne'):
    """2D embedding of the pairwise distance matrix `dist`, returning
    Nx2 x,y-coordinates.

    method : str
        One of 'tsne', 'isomap', 'mds', 'pca', 'kpca', 'lle'.
        Note that 'pca' and 'kpca' operate on 1-dist (a similarity).

    Raises
    ------
    ValueError
        For an unknown method. (BUG FIX: previously an unknown method
        fell through all branches and raised UnboundLocalError on xy.)"""
    if method == 'tsne':
        xy = tsne.run_tsne(dist, no_dims=2)
        #xy=pytsne.run_tsne(adist,no_dims=2)
    elif method == 'isomap':
        isoObj = Isomap(n_neighbors=10, n_components=2)
        xy = isoObj.fit_transform(dist)
    elif method == 'mds':
        mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=15,
                           dissimilarity="precomputed", n_jobs=1)
        xy = mds.fit(dist).embedding_
        # Rotate the MDS solution onto its principal axes for a stable orientation.
        rot = PCA(n_components=2)
        xy = rot.fit_transform(xy)
    elif method == 'pca':
        pcaObj = PCA(n_components=2)
        xy = pcaObj.fit_transform(1-dist)
    elif method == 'kpca':
        pcaObj = KernelPCA(n_components=2, kernel='precomputed')
        xy = pcaObj.fit_transform(1-dist)
    elif method == 'lle':
        lle = manifold.LocallyLinearEmbedding(n_neighbors=30, n_components=2, method='standard')
        xy = lle.fit_transform(dist)
    else:
        raise ValueError('Unknown embedding method: %s' % method)
    return xy