-
Notifications
You must be signed in to change notification settings - Fork 1
/
qMS.py
1276 lines (1070 loc) · 50.8 KB
/
qMS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
.. module:: qMS
:platform: Any
:synopsis: A collection of functions for qMS data processing and analysis
.. moduleauthor:: Joey Davis <joeydavis@gmail.com>; Josh Silverman <josh.silverman@gmail.com>
"""
import os
import csv
import sys
import urllib2
import numpy
import qMSDefs
import pandas as pd
import re
#qMS utilities
# Module-level state initialized at import time.
# Working directory at import; not referenced in the visible portion of this file.
path = os.getcwd()
# Extremely deep recursion limit -- presumably for recursive downstream
# routines; TODO confirm which caller actually needs this.
sys.setrecursionlimit(10000000)
# Legacy parser-state placeholders; not referenced in the visible portion of this file.
[startindex, pep_seq_index, exp_mr_index, csvopen] = [0, 0, 0, ''];
# Default location of shared reference files for this module.
referencePath = '/home/jhdavis/scripts/python/modules/qMSmodule/'
#######Nice sorting and printing utilities######
def tryint(s):
    """Coerce *s* to an int when possible, otherwise return it unchanged.

    Used by alphanum_key to turn digit chunks into numbers so that
    natural ("human") sorting compares them numerically.

    :param s: candidate value, typically a string chunk
    :returns: int(s) when s parses as an integer, else s itself
    """
    try:
        return int(s)
    except (ValueError, TypeError):
        # Narrowed from a bare except: only conversion failures are expected
        # here; anything else (e.g. KeyboardInterrupt) should propagate.
        return s
def alphanum_key(s):
    """Split *s* into alternating string and integer chunks for natural sorting.

    "z23a" -> ["z", 23, "a"]

    :param s: string to decompose
    :returns: list of str and int chunks
    """
    chunks = re.split('([0-9]+)', s)
    return [tryint(chunk) for chunk in chunks]
def sort_nicely(l):
    """Sort the list *l* in place using natural ("human") ordering.

    :param l: list of strings to sort
    :returns: the same list object, now sorted
    """
    l.sort(key=alphanum_key)
    return l
def calculateMedian(statsDictDict, orderedListOfDictKeys, subunits, defaultValue=0.0):
    """Collect per-protein median values across an ordered series of datasets.

    For every protein in *subunits*, walk the datasets named in
    *orderedListOfDictKeys* (in order) and record that dataset's median
    value for the protein; datasets lacking the protein contribute
    *defaultValue* instead.

    :param statsDictDict: dict of statsDicts keyed by dataset name
    :param orderedListOfDictKeys: dataset names, in the order to process
    :param subunits: protein names to collect
    :param defaultValue: value recorded when a protein is absent (default 0.0)
    :returns: dict mapping protein name -> list of medians (one per dataset)
    """
    mediansByProtein = dict((prot, []) for prot in subunits)
    for datasetKey in orderedListOfDictKeys:
        for prot in subunits:
            try:
                value = getProtMedValue(statsDictDict[datasetKey], prot)
            except KeyError:
                value = defaultValue
            mediansByProtein[prot].append(value)
    return mediansByProtein
def readDataFile(filename, scale=1.0, delimiter='\t'):
    """Read a delimited data file with a header row and row labels.

    The file must have a header line naming the fractions and a first
    column naming the proteins.  Empty cells are read as 0.0 (not NaN).
    The optional *scale* multiplies every value - useful for "scaling" error.

    :param filename: path of the file to read
    :type filename: string
    :param scale: multiplicative factor applied to every value (default 1.0)
    :type scale: float
    :param delimiter: field delimiter (default tab)
    :type delimiter: string
    :returns: a data dictionary. 'data', 'fractions', 'proteins' are filled in,
        'fi' and 'pi' are None
    """
    # Text mode: the csv module expects str rows; the previous 'rb' mode was a
    # Python-2 habit. The redundant explicit close() inside the with block and
    # the dead commented-out lambda were removed.
    with open(filename, 'r') as inputFile:
        rows = list(csv.reader(inputFile, delimiter=delimiter))
    header = rows[0]
    proteins = [row[0] for row in rows[1:]]

    def _cell(raw):
        # Empty cells become 0.0; everything else is parsed and scaled.
        return 0.0 if raw == '' else float(raw) * scale

    data = numpy.array([[_cell(col) for col in row[1:]] for row in rows[1:]])
    return {'fractions': header[1:], 'proteins': proteins, 'fi': None, 'pi': None, 'data': data}
def readIsoCSV(filename, columns=None, noProcess=False):
    """readIsoCSV takes a filename pointing to a _iso.csv file. It returns the calculated
    pandas dataFrame. Optional argument columns can be used to specify specific column headers

    :param filename: full path to the _iso.csv file
    :type filename: string
    :param columns: optional list of strings with the columns to incorporate into the dataFrame
    :type columns: list of strings
    :param noProcess: when True, only the raw fit columns are requested (no
        derived/filter columns such as resid or currentPos)
    :type noProcess: bool
    :returns: a pandas dataFrame with the relevant contents of the _iso.csv. Function
        automatically determines if the dataset is a pulse (bears AMP_S) or variable labeling (bears FRC_NX)

    NOTE(review): the header flags (pulse, allClear, priorFilt, allFPass,
    origin, ttof) are only assigned inside the ``columns is None`` branch, so
    passing an explicit *columns* list would hit a NameError below -- in
    practice callers appear to always rely on the default.
    """
    if columns is None:
        # Peek at the header row to learn which optional columns this
        # particular _iso.csv carries.
        r = csv.reader(open(filename))
        header = r.next()  # Python-2 iterator protocol (next(r) in Python 3)
        pulse = 'AMP_S' in header
        varLab = 'FRC_NX' in header
        al = 'AMP_L' in header
        allClear = 'allClear' in header
        origin = 'originFile' in header
        TID = 'TID' in header
        UID = 'UID' in header
        priorFilt = 'priorFilter' in header
        allFPass = 'allFPass' in header
        shortName = 'shortName' in header
        if noProcess:
            # Raw isodist fit columns only.
            columns=['isofile', 'isoprotein', 'isopep', 'mw', 'isoz_charge', 'tim',
                     'chisq', 'symb', 'mz', 'B', 'OFF', 'GW', 'AMP_U',
                     'rt_n14', 'rt_n15', 'mz_n14', 'mz_n15',
                     'ppm_n14', 'ppm_n15', 'n14mass', 'n15mass', 'protein', 'startres',
                     'endres', 'charge', 'missed', 'seq', 'mod', 'seqmod', 'file']
        else:
            # Raw columns plus the derived/filtering columns written by
            # preProcessIsoCSV and downstream tools.
            columns=['isofile', 'isoprotein', 'isopep', 'mw', 'isoz_charge', 'tim',
                     'chisq', 'symb', 'mz', 'B', 'OFF', 'GW', 'AMP_U',
                     'rt_n14', 'rt_n15', 'mz_n14', 'mz_n15',
                     'ppm_n14', 'ppm_n15', 'n14mass', 'n15mass', 'protein', 'startres',
                     'endres', 'charge', 'missed', 'seq', 'mod', 'seqmod', 'file',
                     'resid', 'minIntensity', 'ratio', 'currentCalc',
                     '70Spos', '50Spos', '30Spos', 'otherpos', 'currentPos',
                     'ppmDiff', 'rtDiff', 'handDelete', 'handSave']
        # Optional columns are only requested if the header shows they exist.
        if pulse:
            columns.append('AMP_S')
        if varLab:
            columns.append('FRC_NX')
        if al:
            columns.append('AMP_L')
        if origin:
            columns.append('originFile')
        if allClear:
            columns.append('allClear')
        if TID:
            columns.append('TID')
        if UID:
            columns.append('UID')
        if priorFilt:
            columns.append('priorFilter')
        if allFPass:
            columns.append('allFPass')
        if shortName:
            columns.append('shortName')
        # 'matchmodex' in the header marks a TTOF-style file that carries
        # the original (pre-fit) retention times.
        ttof = 'matchmodex' in header
        if ttof:
            columns.append('pks_orig_rt_n14')
            columns.append('pks_orig_rt_n15')
    data = pd.read_csv(filename, usecols=columns)
    if not pulse:
        # Variable-labeling runs store the labeled amplitude in AMP_L;
        # rename it so downstream code can always address AMP_S.
        data = data.rename(columns={'AMP_L': 'AMP_S'})
    # Fallback ordering: every protein gets a 1-based position in natural
    # sort order, used when no subunit-specific position applies.
    positionOtherDict = {key:int(value)+1 for value, key in enumerate(sort_nicely(sorted(set(data['protein'].values))))}
    positionLookupOther = pd.Series(positionOtherDict)
    # Subunit-specific plotting positions come from qMSDefs lookup tables.
    data['70Spos']=qMSDefs.positionLookup70S[data['protein']].values
    data['50Spos']=qMSDefs.positionLookup50S[data['protein']].values
    data['30Spos']=qMSDefs.positionLookup30S[data['protein']].values
    data['otherpos']=positionLookupOther[data['protein']].values
    data['currentPos']=data['otherpos']
    # Paired 14N/15N differences used by the filtering GUI downstream.
    data['ppmDiff']=data['ppm_n14'] - data['ppm_n15']
    data['rtDiff']=data['rt_n14'] - data['rt_n15']
    if not allClear:
        # First time through: initialize the hand-filtering bookkeeping.
        data['allClear'] = True
        data['handDelete'] = False
        data['handSave'] = False
    if not priorFilt:
        data['priorFilter'] = True
    if not allFPass:
        data['allFPass'] = True
    if not origin:
        data['originFile'] = filename
    if ttof:
        # TTOF files: recompute missed cleavages, ppm error, and RT shift
        # from the raw fields; ppmDiff becomes deviation from the median.
        data['missed'] = data.apply(lambda row: calcMissedTTOF(row), axis=1)
        data['ppm_n14'] = data.apply(lambda row: calcPPMTTOF(row), axis=1)
        data['rtDiff'] = data.apply(lambda row: calcRTDiffTTOF(row), axis=1)
        data['ppm_n15'] = data['ppm_n14']
        data['ppmDiff'] = data['ppm_n14'] - data['ppm_n14'].median()
    return data
def calcRTDiffTTOF(row, origField = 'pks_orig_rt_n14', finalField='rt_n14'):
    """Return the retention-time shift (original minus final) for a TTOF row.

    :param row: mapping/Series carrying both retention-time fields
    :param origField: key of the original retention time
    :param finalField: key of the final (fitted) retention time
    :returns: difference of the two retention times
    """
    original = row[origField]
    final = row[finalField]
    return original - final
def calcPPMTTOF(row, field='mz_n14', useCharge=True):
    """Return the mass error of a TTOF row in parts-per-million.

    Computed as 1e6 * OFF / m/z.  If either operand is non-numeric the
    division raises TypeError and the sentinel -999 is returned.

    :param row: mapping/Series with 'OFF' and *field*
    :param field: key of the m/z value (default 'mz_n14')
    :param useCharge: unused; retained for interface compatibility
    :returns: ppm error, or -999 on non-numeric input
    """
    try:
        ratio = row['OFF'] / row[field]
    except TypeError:
        return -999
    return 1e6 * ratio
def calcMissedTTOF(row, cutters = ['R', 'K'], field='seq'):
    """Count missed cleavages in a peptide sequence.

    Every occurrence of a cutter residue is counted; one is subtracted for
    the expected C-terminal cut, so a fully tryptic peptide with no missed
    cleavages scores 0 and a non-tryptic C terminus scores -1.

    :param row: pandas series (or mapping) holding the sequence under *field*
    :type row: pandas series
    :param cutters: residues counted as cleavage sites (default R, K)
    :type cutters: list of single-character strings
    :param field: key of the sequence field (default 'seq')
    :type field: string
    :returns: number of missed cleavages (int), -1 for non-tryptic C terminus
    """
    sequence = row[field]
    return sum(sequence.count(cutter) for cutter in cutters) - 1
def subtractDoubleSpike(refDF, dataDF, num='AMP_U', den='AMP_S'):
    """subtractDoubleSpike takes two pandas dataFrames, it first divides AMP_U by AMP_S.
    It then finds the median value for the AMP_U/S field on a protein by protein basis from the reference set.
    This is subtracted from the experimental fields (floored at 0.0), and they are multiplied
    back by AMP_S to give the proper value. A corrected DoubleSpikeDF is returned.

    :param refDF: a pandas dataFrame as calculated from an _iso.csv file (qMS.readIsoCSV)
    :type refDF: pandas dataFrame
    :param dataDF: a pandas dataFrame of the experimental dataset to correct (modified in place)
    :type dataDF: pandas dataFrame
    :param num: column name of the numerator amplitude (default 'AMP_U')
    :param den: column name of the denominator amplitude (default 'AMP_S')
    :returns: a pandas DF that has been corrected for a double spike. Should look the same as the input
        dataDF with the AMP_U fields now corrected.
    """
    # Work in ratio space (num/den) for both frames.
    dataDF[num] = dataDF[num]/dataDF[den]
    refDF[num] = refDF[num]/refDF[den]
    for i in dataDF.index:
        # .loc replaces the long-deprecated .ix indexer (removed in pandas 1.0).
        p = dataDF.loc[i, 'protein']
        med = refDF[refDF['protein']==p][num].median()
        # Subtract the reference median ratio, flooring at zero.
        dataDF.loc[i, num] = max(dataDF.loc[i, num] - med, 0.0)
    # Undo the ratio transform so both frames return to amplitude space
    # (refDF is restored to its original values).
    dataDF[num] = dataDF[num]*dataDF[den]
    refDF[num] = refDF[num]*refDF[den]
    return dataDF
def correctFileForDoubleSpike(expPath, refDF=None, refPath=None, num='AMP_U', den='AMP_S'):
    """Load an experimental _iso.csv and correct it for a double spike.

    Exactly one of *refDF* (an already-loaded reference dataFrame) or
    *refPath* (path to the reference _iso.csv, loaded on demand) supplies
    the double-spike-alone reference.

    :param expPath: path to the experimental _iso.csv
    :type expPath: string
    :param refDF: pre-loaded reference dataFrame (optional)
    :type refDF: pandas dataFrame
    :param refPath: path to the reference _iso.csv (used when refDF is None)
    :type refPath: string
    :param num: numerator amplitude column (default 'AMP_U')
    :param den: denominator amplitude column (default 'AMP_S')
    :returns: the corrected experimental pandas dataFrame
    """
    if refDF is None:
        # Lazily load the double-spike-alone reference dataset.
        refDF = readIsoCSV(refPath)
    experimentalDF = readIsoCSV(expPath)
    return subtractDoubleSpike(refDF, experimentalDF, num=num, den=den)
def correctListOfFiles(refPath, listOfFiles, extension=None, savePath=None, num='AMP_U', den='AMP_S'):
    """Double-spike-correct a batch of _iso.csv files against one reference.

    The reference (double-spike-alone) dataset is loaded once and applied to
    every file.  If *extension* is given, each corrected frame is also
    written to disk: next to the input when *savePath* is None, otherwise
    into *savePath* under the input's basename plus *extension*.

    :param refPath: path to the reference _iso.csv
    :type refPath: string
    :param listOfFiles: paths of the files to correct
    :type listOfFiles: list of strings
    :param extension: optional suffix appended to the output file name
    :param savePath: optional directory for the written outputs
    :param num: numerator amplitude column (default 'AMP_U')
    :param den: denominator amplitude column (default 'AMP_S')
    :returns: dict mapping each input path to its corrected pandas dataFrame
    """
    refDF = readIsoCSV(refPath)
    correctedByPath = {}
    for filePath in listOfFiles:
        corrected = correctFileForDoubleSpike(filePath, refDF=refDF, refPath=None, num=num, den=den)
        correctedByPath[filePath] = corrected.copy()
        if extension is not None:
            if savePath is None:
                corrected.to_csv(filePath+extension, index=False)
            else:
                baseName = filePath.split('/')[-1]
                corrected.to_csv(savePath+baseName+extension, index=False)
    return correctedByPath
def generateDFDict(listOfFiles):
    """Load a batch of _iso.csv files into a dict of pandas dataFrames.

    :param listOfFiles: paths of the files to load
    :type listOfFiles: list of strings
    :returns: dict mapping each input path to its pandas dataFrame
    """
    frames = {}
    for filePath in listOfFiles:
        frames[filePath] = readIsoCSV(filePath)
    return frames
def openListOfFiles(listOfFiles):
    """Open a list of _iso.csv files and return their contents.

    :param listOfFiles: paths of the files to open
    :type listOfFiles: list of strings
    :returns: dict mapping each input path to a copy of its pandas dataFrame
    """
    return {filePath: readIsoCSV(filePath).copy() for filePath in listOfFiles}
def unity(x):
    """Identity transform: return *x* unchanged (default hook for calcValue)."""
    return x
def calcStatsDict(dataFrame, numerator, denominator, normalization=1.0, offset=0.0, func=unity, adjProt=None):
    """Build a per-protein dict of calculated ratio values from a dataFrame.

    For each protein present in the frame, the num/den ratio is computed
    per peptide (offset applied first, then *func*, then *normalization*).

    :param dataFrame: pandas dataFrame from an _iso.csv (qMS.readIsoCSV)
    :type dataFrame: pandas dataFrame
    :param numerator: amplitude column names summed for the numerator
    :type numerator: list of strings
    :param denominator: amplitude column names summed for the denominator
    :type denominator: list of strings
    :param normalization: uniform scaling factor applied last (default 1.0)
    :type normalization: float
    :param offset: uniform offset applied first (default 0.0)
    :type offset: float
    :param func: transform applied to the ratio series (default identity)
    :param adjProt: optional (proteinName, factor) pair; that protein's
        values are multiplied by the factor after everything else
    :returns: dict mapping protein name -> numpy array of values
    """
    proteinNames = sorted(set(dataFrame['protein'].values))
    statsDict = {}
    for proteinName in proteinNames:
        subFrame = dataFrame[dataFrame['protein']==proteinName]
        values = calcValue(subFrame, numerator, denominator, offset=offset, func=func)
        statsDict[proteinName] = values.values*normalization
    if adjProt is not None:
        statsDict[adjProt[0]] = statsDict[adjProt[0]]*adjProt[1]
    return statsDict
def multiStatsDictFromDF(dFDict, num, den, namesList=None, normalization=1.0, offset=0.0, normProtein=None, adjProt=None):
    """multiStatsDictFromDF takes a dict of dataframes and num/den column lists.
    It returns a dict of dicts (first key is the file name) - this leads to a statsDict
    that is keyed by protein names. All of the statsDicts contain a full complement of keys
    (based on the file list), with empty numpy arrays if there were no values in the original
    dataset

    :param dFDict: a dict of pandas dataframes (keys are the names of the dataframes - output of qMS.correctListOfFiles)
    :type dFDict: a dict of pandas dataframes
    :param num: a list of strings identifying the numerator (must be AMP_U, AMP_L, and/or AMP_S)
    :type num: list of strings
    :param den: a list of strings identifying the denominator (must be AMP_U, AMP_L, and/or AMP_S)
    :type den: list of strings
    :param namesList: a list of strings identifying the keys in the dfDict to use
    :type namesList: list of strings
    :param normalization: a float normalization factor if you want to scale all of the values uniformly
    :type normalization: float
    :param offset: a float offset factor if you want to alter the values uniformly
    :type offset: float
    :param normProtein: string of the protein to normalize to (will be to the median)
    :type normProtein: string
    :param adjProt: optional (proteinName, factor) pair applied to every dataset at the end
    :returns: a dictionary of dictionaries of numpy arrays. First key is the file name, this leads
        to a dictionary where the first key is the protein name. This leads to a numpy array of values
    """
    if namesList is None:
        # NOTE(review): Python-2 .keys() returns a list (indexable below);
        # under Python 3 this would need list(dFDict.keys()).
        namesList = dFDict.keys()
    # Seed the union-of-proteins dict from the first dataset.
    allPs = calcStatsDict(dFDict[namesList[0]], num, den)
    dFStatsDict = dict()
    for name in namesList:
        df = dFDict[name]
        dFStatsDict[name] = calcStatsDict(df, num, den, normalization=normalization, offset=offset)
        if not (normProtein is None):
            # Recompute with a per-dataset normalization: 1/median of the
            # chosen protein's values (the first pass supplied the median).
            normValue = 1/numpy.median(dFStatsDict[name][normProtein])
            dFStatsDict[name] = calcStatsDict(df, num, den, normalization=normValue, offset=offset)
    # Accumulate the union of all protein keys across every dataset...
    for name in namesList[1:]:
        allPs = appendKeys(allPs, dFStatsDict[name])
    # ...then backfill every dataset with empty arrays for missing proteins.
    for name in namesList:
        dFStatsDict[name] = appendKeys(dFStatsDict[name], allPs)
    if not (adjProt is None):
        # Apply the (protein, factor) adjustment uniformly to all datasets.
        for name in namesList:
            dFStatsDict[name][adjProt[0]] = dFStatsDict[name][adjProt[0]]*adjProt[1]
    return dFStatsDict
def multiStatsDict(isoFileList, num, den, normalization=1.0, offset=0.0, normProtein=None, noProcess=False, adjProt=None):
    """Load a list of _iso.csv files and build per-file statsDicts.

    Thin wrapper: reads every file with readIsoCSV and delegates to
    multiStatsDictFromDF, keeping the file order as the namesList.

    :param isoFileList: full paths of the _iso.csv files
    :type isoFileList: list of strings
    :param num: amplitude columns summed for the numerator (AMP_U/AMP_L/AMP_S)
    :type num: list of strings
    :param den: amplitude columns summed for the denominator (AMP_U/AMP_L/AMP_S)
    :type den: list of strings
    :param normalization: uniform scaling factor (default 1.0)
    :type normalization: float
    :param offset: uniform offset (default 0.0)
    :type offset: float
    :param normProtein: optional protein whose median normalizes each dataset
    :type normProtein: string
    :param noProcess: passed through to readIsoCSV (raw columns only)
    :param adjProt: optional (proteinName, factor) adjustment pair
    :returns: dict of dicts of numpy arrays: file name -> protein -> values
    """
    framesByPath = {filePath: readIsoCSV(filePath, noProcess=noProcess) for filePath in isoFileList}
    return multiStatsDictFromDF(framesByPath, num, den, namesList=isoFileList,
                                normalization=normalization, offset=offset,
                                normProtein=normProtein, adjProt=adjProt)
def mergeFiles(fileList, numerator, denominator, normProtein=None):
    """Merge several _iso.csv files into one statsFile-style dictionary.

    Each file is converted to a statsDict, then the per-protein value
    arrays are concatenated across files (in sorted file-name order).

    :param fileList: full paths of the .csv files to merge
    :type fileList: list of strings
    :param numerator: amplitude columns summed for the numerator
    :type numerator: list of strings
    :param denominator: amplitude columns summed for the denominator
    :type denominator: list of strings
    :param normProtein: optional protein to normalize each dataset to (median)
    :type normProtein: string
    :returns: dict mapping protein name -> concatenated numpy array of values
    """
    splitDict = multiStatsDict(fileList, numerator, denominator, normProtein=normProtein)
    orderedNames = sorted(splitDict.keys())
    mergedDict = splitDict[orderedNames[0]]
    for name in orderedNames[1:]:
        current = splitDict[name]
        for protein in sorted(current.keys()):
            mergedDict[protein] = numpy.concatenate((mergedDict[protein], current[protein]))
    return mergedDict
def getProtMedValue(statsFileDict, proteinName):
    """Return the median of one protein's values from a statsFile-style dict.

    :param statsFileDict: dict of numpy arrays keyed by protein name
        (e.g. output of calcStatsDict)
    :type statsFileDict: dict
    :param proteinName: protein whose median is wanted
    :type proteinName: string
    :returns: the median value
    """
    proteinValues = statsFileDict[proteinName]
    return numpy.median(proteinValues)
def calcPercent(f, sigfig=2):
    """Render a fraction as a percentage string.

    The fraction is rounded to *sigfig* decimal places, multiplied by 100,
    and re-rounded to strip binary floating-point noise (the previous
    implementation could emit strings like "12.000000000000002 %").

    :param f: fraction to convert to a percentage
    :type f: float
    :param sigfig: number of decimal places to round the fraction to (default 2)
    :type sigfig: int
    :returns: the rounded value times 100, as a string with ' %' appended
    """
    # Second round() removes artifacts introduced by the *100 multiply.
    s = str(round(round(f, sigfig) * 100, sigfig))
    return s + " %"
def maxLabFunc(k, t):
    """Maximum labeling curve, 1 - exp(-k*t) (Stephen Chen's paper).

    :param k: growth rate (ln(2)/doubling time)
    :type k: float
    :param t: time; a scalar returns the max labeling at that point, an
        array returns the whole curve
    :type t: float/int/array
    :returns: max labeling (scalar or array, matching t)
    """
    decay = numpy.exp(-k * t)
    return 1 - decay
def poolFunc(k, t, P):
    """Labeling kinetics for a protein with precursor pool size P.

    Equation from Stephen Chen's paper; use this to fit the labeling of the
    terminal (70S) pool.

    :param k: growth rate (ln(2)/doubling time)
    :type k: float
    :param t: time; scalar gives one point, array gives the curve
    :type t: float/int/array
    :param P: pool size, expressed as precursorPool/completedRibosomePool
    :type P: float
    :returns: expected labeling (scalar or array, matching t)
    """
    fastDecay = numpy.exp(-k * (1.0 + 1.0 / P) * t)
    slowDecay = numpy.exp(-k * t)
    return 1.0 + P * fastDecay - (1.0 + P) * slowDecay
def poolInterFunc(k, t, P):
    """Labeling kinetics via the overlabeling of an intermediate.

    Derived from the differential equation in Stephen Chen's paper.

    :param k: growth rate (ln(2)/doubling time)
    :type k: float
    :param t: time; scalar gives one point, array gives the curve
    :type t: float/int/array
    :param P: pool size (precursorPool/completedRibosomePool)
    :type P: float
    :returns: expected labeling (scalar or array, matching t)
    """
    exponent = -k * (1.0 + 1.0 / P) * t
    return 1.0 - numpy.exp(exponent)
def poolInterFracXFunc(k, t, P, X=0.65):
    """Mixed labeling kinetics: fraction X via the intermediate, 1-X terminal.

    Linear blend of poolInterFunc and poolFunc from Stephen Chen's paper.

    :param k: growth rate (ln(2)/doubling time)
    :type k: float
    :param t: time; scalar gives one point, array gives the curve
    :type t: float/int/array
    :param P: pool size (precursorPool/completedRibosomePool)
    :type P: float
    :param X: fraction weighted toward the intermediate term (default 0.65)
    :type X: float
    :returns: expected labeling (scalar or array, matching t)
    """
    intermediatePart = poolInterFunc(k, t, P)
    terminalPart = poolFunc(k, t, P)
    return X * intermediatePart + (1.0 - X) * terminalPart
def overLabelingFunc(k, t, d):
    """Labeling kinetics for a protein with turnover rate d.

    Equation from Stephen Chen's paper: 1 - exp(-(d+k)*t).

    :param k: growth rate (ln(2)/doubling time)
    :type k: float
    :param t: time; scalar gives one point, array gives the curve
    :type t: float/int/array
    :param d: turnover rate
    :type d: float
    :returns: expected labeling (scalar or array, matching t)
    """
    combinedRate = d + k
    return 1.0 - numpy.exp(-combinedRate * t)
def growthCurve(doublingTime, t, Ai):
    """Exponential growth curve: Ai * 2**(t / doublingTime).

    :param doublingTime: doubling time in units of time (mins, secs)
    :type doublingTime: float
    :param t: elapsed time (scalar or array, same units as doublingTime)
    :type t: float/int/array
    :param Ai: initial amount at t=0
    :type Ai: float
    :returns: amount after time t (scalar or array, matching t)
    """
    doublings = t / float(doublingTime)
    return Ai * numpy.exp2(doublings)
def growthRate(doublingTime):
    """Convert a doubling time into a growth rate k = ln(2)/doublingTime.

    The returned k plugs into the labeling equations from Stephen Chen's paper.

    :param doublingTime: doubling time in units of time (mins, secs)
    :type doublingTime: float
    :returns: growth rate k (units of inverse time)
    """
    ln2 = numpy.log(2)
    return ln2 / float(doublingTime)
def addBlankKey(d, k):
    """Ensure a statsDict has an entry for key *k*.

    If *k* is missing, an empty numpy array is inserted; existing entries
    are left untouched.

    :param d: statsDictionary
    :type d: a dict of numpy arrays - keyed by protein names
    :param k: name of the key to check
    :type k: string
    :returns: the same statsDictionary, guaranteed to contain *k*
    """
    # dict.has_key() was removed in Python 3; `in` works in Python 2 and 3.
    if k not in d:
        d[k] = numpy.array([])
    return d
def appendKeys(d1, d2):
    """Add every key of *d2* to *d1*, inserting empty arrays for new keys.

    :param d1: statsDictionary to extend
    :type d1: a dict of numpy arrays - keyed by protein names
    :param d2: statsDictionary supplying the keys
    :type d2: a dict of numpy arrays - keyed by protein names
    :returns: d1, now containing (at least) every key of d2
    """
    for proteinKey in d2:
        d1 = addBlankKey(d1, proteinKey)
    return d1
def calcValue(df, num, den, offset=0.0, func=unity):
    """Compute (sum of num columns)/(sum of den columns) + offset for a frame.

    :param df: pandas dataFrame with the information from an _iso.csv
    :type df: pandas dataFrame
    :param num: column names summed for the numerator (AMP_U, AMP_L, AMP_S)
    :type num: list of strings
    :param den: column names summed for the denominator (AMP_U, AMP_L, AMP_S)
    :type den: list of strings
    :param offset: optional offset added to the ratio
    :type offset: float
    :param func: transform applied to the result (default identity)
    :returns: func applied to the ratio series (or to 0.0 when the frame
        contains non-numeric entries)
    """
    numeratorSum = df[num[0]]
    for columnName in num[1:]:
        numeratorSum = numeratorSum + df[columnName]
    denominatorSum = df[den[0]]
    for columnName in den[1:]:
        denominatorSum = denominatorSum + df[columnName]
    try:
        value = numeratorSum / denominatorSum + offset
    except TypeError:
        # Non-numeric entries (stray strings) poison the arithmetic.
        print("Error in calculating values - some entry must contain strings.")
        print("This can be fixed by deleting this row in vi (you'll see a bunch of NaN values there).")
        print("Until this is fixed, all values set to 0.0")
        value = 0.0
    return func(value)
def boolParse(s):
    """Parse a string as a boolean.

    Any capitalization of "true" yields True; every other string yields False.

    :param s: string to process
    :type s: string
    :returns: bool (see description)
    """
    normalized = s.upper()
    return normalized == 'TRUE'
def preProcessIsoCSV(isoPath, genPlots=True):
    """Process an _iso.csv: add resid, minIntensity, currentCalc and ratio columns.

    Helper that loads the raw file, computes the labeling columns, calls
    calcResidual, then writes the result next to the input as *_res.csv.

    :param isoPath: full path to the _iso.csv file
    :type isoPath: string
    :param genPlots: whether calcResidual should also emit .plots files
        into the _peaks directory
    :type genPlots: bool
    :returns: the completed dataFrame with resid and currentCalc columns
    """
    print("reading : " + isoPath + "...")
    frame = readIsoCSV(isoPath, noProcess=True)
    frame['currentCalc'] = calcValue(frame, ['AMP_U'], ['AMP_U', 'AMP_S'])
    frame['ratio'] = calcValue(frame, ['AMP_U'], ['AMP_S'])
    rootPath = '/'.join(isoPath.split('/')[:-1]) + '/'
    frame = calcResidual(rootPath, frame, genPlots=genPlots)
    # Output name: every '.' becomes '_res.' (e.g. x_iso.csv -> x_iso_res.csv).
    fileName = isoPath.split('/')[-1].replace('.', '_res.')
    print("writing : " + fileName + "...")
    frame.to_csv(rootPath + fileName, index=False)
    return frame
def cleanPlotsDir(path, extensions=['.newcon', '.fit', '.png']):
    """Delete generated plot artifacts from a _peaks directory via the shell.

    For each extension an `rm -rf <path>/*<ext>` command is echoed, run,
    and its output printed.

    :param path: full path to the _peaks directory (no trailing /)
    :type path: string
    :param extensions: file extensions to remove
    :type extensions: list of strings
    :returns: None; side effect is deleting all *.ext files under path
    """
    for fileExt in extensions:
        command = 'rm -rf ' + path + '/*' + fileExt
        print(command)
        commandOutput = os.popen(command).read()
        print(commandOutput)
def calcResidual(datapath, dataFrame, genPlots=False):
    """calcResidual takes a path to a _peaks directory and a pandas dataFrame containing the contents
    of the _iso.csv file. It appends a column to the dataFrame with the calculated residual
    (the difference between the fit and data in a .dat file) for each peptide. It also appends a
    column to the dataFrame with the max fit intensity.
    Optional parameter genPlots will also generate .plots files that can be used to plot the datasets

    :param datapath: A string with the full path to the _plots directory
    :type datapath: string
    :param dataFrame: A pandas dataframe with ['isofile'] at the minimum (must point to the .dat files in
        the _plots directory)
    :type dataFrame: pandas dataframe
    :param genPlots: Optional boolean telling function if it should write .plots files
    :type genPlots: boolean
    :returns: the dataFrame modified to include the 'resid' and 'minIntensity' columns. .Dats that cause any errors
        are given fits with constant 666, resids with constant 666, and minIntensity with constant -666.
    """
    # Initialize the output columns before the per-peptide loop fills them in.
    dataFrame['resid']=0
    dataFrame['minIntensity']=0
    for iso in dataFrame['isofile'].values:
        datFileName = datapath+iso+'.dat'
        try:
            # .dat layout: three unnamed columns - offset, data trace, residual trace.
            datPd = pd.read_csv(datFileName, names=['offset', 'dat', 'resid'], header=None)
            del datPd['offset']
            # Shift the residual trace so its median lines up with the data trace's median.
            datPd['residAdj'] = datPd['resid']+(datPd['dat'].median()-datPd['resid'].median())
            datPd['fit'] = datPd['residAdj']+datPd['dat']
            # Residual score: total |adjusted residual| normalized by the
            # smaller of the fit/data peak heights.
            calcResid = datPd['residAdj'].abs().sum()/min([datPd['fit'].max(), datPd['dat'].max()])
            calcMinIntensity = min([datPd['fit'].max(), datPd['dat'].max()])
        except (IOError, TypeError, Exception) as e:
            # NOTE(review): including Exception makes this handler effectively
            # catch-all, and e.message is Python-2-only - confirm intent.
            print "Error " + e.message + " in " + datFileName
            sys.stdout.flush()
            # Sentinel values flag the unreadable/broken .dat downstream.
            datPd = pd.DataFrame({'fit' : 666}, index=[0])
            datPd['residAdj'] = 666
            calcResid = 666
            calcMinIntensity = -666
        #row = dataFrame[dataFrame['isofile']==iso]
        #row['resid'] = calcResid
        #row['minIntensity'] = calcMinIntensity
        #dataFrame.update(row)
        # Write the scores back into the matching row of the input frame.
        rowIX = dataFrame[dataFrame['isofile']==iso].index.values[0]
        dataFrame.loc[rowIX, 'resid'] = calcResid
        dataFrame.loc[rowIX, 'minIntensity'] = calcMinIntensity
        if genPlots:
            # Side effect: emit a .plots file next to the .dat for plotting tools.
            datPd.to_csv(datapath+iso+'.plots', index=False)
    return dataFrame
def getRPSeqData(ID):
    """Fetch the cDNA and amino-acid sequences for a ribosomal protein gene.

    :param ID: string with the geneID
    :type ID: string
    :returns: a list of strings, first is the cDNA, second the AA sequence
    """
    address = 'http://ribosome.med.miyazaki-u.ac.jp/rpg.cgi?id='+ID+'&mode=seq'
    page_html = urllib2.urlopen(address).read()
    # Slice the sequences out of the page using fixed anchors around the
    # two textarea elements (offsets skip the anchor markup itself).
    cdna_start = page_html.find('>cDNA Sequence')
    cdna_end = page_html.find('</textarea></td></tr>\n<tr><td align="center" bgcolor="#FFFF80" width="150">Amino Acids Sequence</td><td>')
    cdna_seq = page_html[cdna_start+74:cdna_end-1]
    aa_end = page_html.find('</textarea></td></tr>\n</table>\n<div class="footer">')
    aa_seq = page_html[cdna_end+155:aa_end]
    return [cdna_seq, aa_seq]
def getRPInfo(numberGenes, start=10, baseURL='http://ribosome.med.miyazaki-u.ac.jp/rpg.cgi?mode=gene&id=ECO100'):
    """getRPInfo generates two dictionaries with relevant ribosomal protein information from a database.
    It prefers the url listed above, but can be used with other organisms so long as the
    find commands still work
    :param numberGenes: the number of genes to look for
    :type numberGenes: int
    :param start: the first numeric gene-id suffix appended to baseURL, defaults to 10
    :type start: int
    :param baseURL: a string pointing to the base url, defaults to the japanese database
    :type baseURL: string
    :returns: a list of dictionaries, first is by geneNames, second by geneProduct;
        each dictionary is a dict of dicts with subkeys size, cDNA, AA and either GP or GN (opposite
        of the base key)
    """
    # NOTE(review): ids below 10 would need zero-padding ('01'..'09'); the
    # disabled code that handled them was removed, hence start defaults to 10.
    genes = range(start, start+numberGenes)
    base = baseURL
    addys = [base + str(i) for i in genes]
    print addys
    rpdictGN = {}
    rpdictGP = {}
    for address in addys:
        website = urllib2.urlopen(address)
        website_html = website.read()
        # Scrape fields with fixed character offsets past anchor strings;
        # these assume the database's exact page layout -- TODO confirm
        # before reusing with another organism.
        gni = website_html.find('>Gene Name</td><td>')
        gn = website_html[gni+19:gni+26]
        gpi = website_html.find('ibosomal protein ')
        gp = website_html[gpi+17:gpi+20]
        gp = gp.upper()
        # Two-character product names pick up the following '<' tag char; trim it.
        if gp[-1] == '<':
            gp = gp[:-1]
        # Special-case remaps for product names the 3-char slice mangles.
        if gp == 'ITL':
            gp = 'L9'
        if gp == 'L7/':
            gp = 'L7/L12'
        si = website_html.find('Gene Size [bp]</td><td>')
        s = website_html[si+23:si+27]
        if s[-1] == '<':
            s = s[:-1]
        # The last 8 characters of the address are the gene id used by getRPSeqData.
        cDNA, AA = getRPSeqData(address[-8:])
        rpdictGN[gn] = {'GP':gp, 'size':s, 'cDNA':cDNA, 'AA':AA}
        rpdictGP[gp] = {'GN':gn, 'size':s, 'cDNA':cDNA, 'AA':AA}
    return [rpdictGN, rpdictGP]
#fetches protein sequence from uniprot ID
def getsequence(uniprot):
    """Fetch the amino-acid sequence for a Uniprot accession from uniprot.org.

    :param uniprot: a Uniprot accession id
    :type uniprot: string
    :returns: the sequence as one string with the fasta header (line 0) and
        newlines stripped; implicitly returns None when the request fails
    """
    try:
        urlbase = "http://www.uniprot.org/uniprot/"
        req = urllib2.Request(urlbase + uniprot + ".fasta")
        response = urllib2.urlopen(req)
        # Skip line 0 (the '>' fasta header); keep only the sequence lines.
        lines = [x for i, x in enumerate(response) if i > 0]
        return "".join(map(lambda x: x.rstrip("\n"), lines))
    except urllib2.URLError:
        print "No internet connection or bad Uniprot id"
#Returns the MAD of a list
def MAD(list):
    """Return the median absolute deviation (MAD) of a sequence of numbers.

    :param list: sequence of numbers (parameter name kept for backward
        compatibility even though it shadows the builtin ``list``)
    :returns: numpy float, median of |x - median(list)|
    """
    # Vectorized form: the old map(lambda ...) produced an iterator under
    # Python 3, which numpy.median cannot consume.
    values = numpy.asarray(list, dtype=float)
    return numpy.median(numpy.abs(values - numpy.median(values)))
def listReplace(l, to, rv):
    """Return a copy of l with every occurrence of to swapped for rv.

    :param l: list to be replaced
    :type l: list
    :param to: item to be replaced
    :type to: string
    :param rv: item to replace with
    :type rv: string
    :returns: a new list with all occurrences of to replaced with rv
    """
    return [rv if item == to else item for item in l]
def printSortedDict(d):
    """Serialize a dictionary as "key:value, " pairs in sorted key order.

    :param d: a dictionary to be printed
    :type d: dict
    :returns: a string of "key:value, " entries, sorted by key (note the
        trailing ", " after the last entry, kept for compatibility)
    """
    # sorted() replaces the old keys()/.sort() pair (which breaks on
    # Python 3 dict views), and keys are looked up directly: the old
    # d[str(i)] raised KeyError for any non-string key.
    return ''.join(str(k) + ':' + str(d[k]) + ', ' for k in sorted(d.keys()))
def calcMW(seq):
    """Compute the mass of an amino-acid sequence.

    :param seq: the sequence to be calculated
    :type seq: string
    :returns: a float with the mass of the sequence (0.0 for an empty sequence)
    """
    # Sum per-residue weights from the shared table; case-fold each
    # residue so lower-case sequences work too.
    return sum((qMSDefs.aaweights[residue.upper()] for residue in seq), 0.0)
def dropDuplicatesPandas(df, cols_to_consider=None):
    """dropDuplicatesPandas is a helper to remove duplicated rows based on a set
    of columns that must all contain identical information.
    :param df: the dataframe to be inspected
    :type df: a pandas dataframe
    :param cols_to_consider: the list of columns to be inspected. this defaults
        to an extensive list that is used for the MRM material, although
        technically an optional parameter, this should almost always be passed.
    :type cols_to_consider: a list of strings
    :returns: the unique dataframe (first row of each duplicate set, in the
        original row order)
    """
    if cols_to_consider is None:
        cols_to_consider=['Protein Name', 'Begin Pos', 'End Pos', 'Missed Cleavages',
        'Precursor Charge', 'File Name', 'Peptide Modified Sequence', 'Average Measured Retention Time',
        'light Total Area', 'heavy Total Area', 'light Precursor Mz', 'heavy Precursor Mz']
    # drop_duplicates keeps the first row of each duplicate set and preserves
    # the original row order. The previous groupby/reindex trick additionally
    # *silently discarded* any row with NaN in one of the key columns
    # (groupby drops NaN group keys) and returned rows in arbitrary order.
    return df.drop_duplicates(subset=cols_to_consider)
def print_fullPandas(x):
    """print_fullPandas is a helper function for printing a full dataframe
    :param x: the pandas dataframe to be printed
    :type x: a pandas dataframe
    :returns: no return, simply prints the full dataframe and then restores
        the caller's pandas display settings.
    """
    # option_context restores the *caller's* prior settings even if print
    # raises; the old set_option/reset_option pair leaked the temporary
    # settings on error and reset to library defaults rather than to
    # whatever the caller had configured.
    with pd.option_context('display.max_rows', len(x), 'display.max_columns', 200):
        print(x)
def makeNameConvDict(set1, set2):
    """Build a pair of name-conversion dictionaries from two parallel lists.

    :param set1: a list of strings in the first set
    :type set1: list of strings
    :param set2: a list of strings in the second set
    :type set2: list of strings
    :returns: a list of dictionaries - [0] maps set1 to set2;
        [1] maps set2 to set1
    """
    forward = {}
    backward = {}
    # Walk the first list by position and pair each name with its partner
    # at the same index in the second list.
    for position, name in enumerate(set1):
        partner = set2[position]
        forward[name] = partner
        backward[partner] = name
    return [forward, backward]
def convertNames(fullpath, oldNames, newNames):
    """convertNames is a helper function that replaces the protein fields in an iso csv files
    with a converted set of names. The old names are given as a list in oldNames.
    The new names are given as a list in newNames. Returns the pandas dataframe (with new names).
    **HAS EXTERNALITY - CREATES A NEW _ISO.CSV FILE IN THE PATH DIRECTORY THAT HAS THE NEW NAMES.
    :param fullpath: a path string to the file to convert
    :type fullpath: string
    :param oldNames: a list of strings of the old names
    :type oldNames: list of strings
    :param newNames: a list of strings of the new names (parallel to oldNames)
    :type newNames: list of strings
    :returns: a pandas DF with the corrected names.
    **HAS EXTERNALITY - CREATES A NEW _ISO.CSV FILE IN THE PATH DIRECTORY THAT HAS THE NEW NAMES.
    """
    # Build the old->new mapping; the reverse mapping is unused here.
    [oldToNew, newToOld] = makeNameConvDict(oldNames, newNames)
    dataFrame = readIsoCSV(fullpath)
    # replace() converts the 'protein' column; update() writes it back
    # into the frame in place.
    dataFrame.update(dataFrame['protein'].replace(oldToNew))
    # Drop the '.csv' suffix and tag the converted output file.
    dataFrame.to_csv(fullpath[:-4]+'_newNames.csv', index=False)
    return dataFrame
def concatonateIsoCSVFiles(fileList, outFile='mergedOutTemp.csv'):
    """concatonateIsoCSVFiles is a helper function that merges a series of isocsv files.
    The header of the first file is kept; the header line of every later file
    is skipped.
    :param fileList: a list of full path strings to the files to be merged
    :type fileList: a list of strings
    :param outFile: path of the merged file to write, defaults to mergedOutTemp.csv
    :type outFile: string
    :returns: no return.
    **HAS EXTERNALITY - CREATES A NEW _ISO.CSV FILE (default mergedOutTemp.csv) IN THE WORKING DIRECTORY.
    """
    # Collect pieces in a list and join once: the old += loop was quadratic
    # and none of the file handles were ever closed.
    parts = []
    with open(fileList[0], 'r') as f:
        parts.append(f.read())
    for fname in fileList[1:]:
        with open(fname, 'r') as f:
            next(f)  # skip this file's header row
            parts.extend(f)
    with open(outFile, 'w') as fout:
        fout.write(''.join(parts))
def addProts(addFrom, addTo, output, listOfProts):
    """Adds specific proteins from one iso csv to another.
    :param addFrom: path to the csv whose matching rows are appended (its header is skipped)
    :type addFrom: string
    :param addTo: path to the csv whose full contents seed the output
    :type addTo: string
    :param output: path of the merged file to write
    :type output: string
    :param listOfProts: substrings; any line of addFrom containing one is copied
    :type listOfProts: a list of strings
    :returns: no return.
    **HAS EXTERNALITY - CREATES A NEW _ISO.CSV FILE AT THE output PATH.
    """
    # Collect pieces and join once (the old += loop was quadratic and
    # leaked both file handles).
    parts = []
    with open(addTo, 'r') as f:
        parts.append(f.read())
    with open(addFrom, 'r') as f:
        next(f)  # skip header
        for line in f:
            # any() appends a matching line exactly once; the old nested
            # loop duplicated lines that matched several proteins.
            if any(p in line for p in listOfProts):
                parts.append(line)
    with open(output, 'w') as fout:
        fout.write(''.join(parts))
def readMSSpectraFile(datapath):
"""readMSSpectraFile is a helper function that reads a spectra .txt file (saved in the _plots directory by massacre)
:param datapath: a full path string to the file to be read
:type datapath: a sting
:returns: a list with arrays for the x values [0], y values [1], and the datapath [3]
"""
data = list(csv.reader( open(datapath, 'rU'), delimiter=' '))
xs = []
ys = []
for line in data:
xs.append(float(line[0]))