-
Notifications
You must be signed in to change notification settings - Fork 0
/
AndersonDarling.py
127 lines (86 loc) · 4.15 KB
/
AndersonDarling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Script to implement Anderson darling test for comparison of distributions. Takes 2 files that are generated by
distances2.py script: Random_to_ref_distances_as_list.txt and a file with name in form
Query_UCEs_with_distances_to_ref_Ear_eye_nervous_Pooledbreakpoints_1kbcluster.txt
Ruth McCole
April 3rd 2017
Copyright 2017 Harvard University, Wu Lab
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import pandas as pd
from scipy import stats
def get_args():
parser = argparse.ArgumentParser(description="Description")
parser.add_argument('-q', "--queryFile", type=str,
help='Filename for query to ref distances')
parser.add_argument('-r', '--randomFile', type=str,
help='Filename for random to ref distances')
parser.add_argument('-l', '--limit', type=float,
help='Limit for lengths to cutoff for distribution comparison')
parser.add_argument('-o', '--outputStem', type=str, default='Test',
help='Output stem')
return parser.parse_args()
def getDataIntoPandas(strFilename):
#NB: this assumes your files DO have headers
print 'Getting data from {0}'.format(strFilename)
pdData = pd.read_csv(strFilename, sep='\t')
return pdData
def getDataIntoPandasNoHeader(strFilename, arHeaders):
#This is for files without headers
pdData = pd.read_csv(strFilename, sep='\t', header=None, names=arHeaders)
return pdData
def savePanda(pdData, strFilename):
pdData.to_csv(strFilename, sep='\t', index=False)
def applyLimit(pdDistances, floatLimit, strColumnName):
#Now have to make this work as a series, or make randoms a 1 column dataframe
BoolPdLimit = pdDistances[strColumnName] < floatLimit
pdDistancesLimit = pdDistances[BoolPdLimit]
print 'Limit of {0}kb applied'.format(floatLimit)
return pdDistancesLimit
def getAD(pdQuery, pdRef):
arQuery = pdQuery.values.flatten().tolist()
arRef = pdRef.values.flatten().tolist()
tupAD = stats.anderson_ksamp([arQuery, arRef])
return tupAD
def processTupAD(tupAD, strOutputStem):
print 'AD test stat is {0}'.format(str(tupAD[0]))
print 'AD critical values at 25%, 10%, 5%, 2.5%, 1% respecitively are:'
print tupAD[1]
arCritical = tupAD[1]
print 'Approx p-value is {0}'.format(str(tupAD[2]))
with open('AD_result_{0}.txt'.format(strOutputStem), 'a') as fH:
fH.write('AD test stat is {0}'.format(str(tupAD[0])) + "\n")
fH.write('AD critical values at 25%, 10%, 5%, 2.5%, 1% respectively are:' +'\n')
fH.write(str(tupAD[1]) + '\n')
fH.write('Approx p-value is {0}'.format(str(tupAD[2])))
def main():
args = get_args()
strOutputStem = args.outputStem
pdQueryFull = getDataIntoPandas(args.queryFile)
pdQueryDistances = pdQueryFull[['Distance_Kb']]
seriesRandomDistances = pd.read_csv(args.randomFile, squeeze=True)
pdRandomDistances = getDataIntoPandasNoHeader(args.randomFile, ['Random_distances_Kb'])
if args.limit:
floatLimit = args.limit
print 'limit of {0}kb'.format(str(floatLimit))
pdQueryLimit = applyLimit(pdQueryDistances, floatLimit, 'Distance_Kb')
pdRandomsLimit = applyLimit(pdRandomDistances, floatLimit, 'Random_distances_Kb')
#Save for future ref
savePanda(pdQueryLimit, 'Query_limit_to_{0}kb.txt'.format(str(floatLimit)))
savePanda(pdRandomsLimit, 'Randoms_limited_to_{0}kb.txt'.format(str(floatLimit)))
tupAD = getAD(pdQueryLimit, pdRandomsLimit)
processTupAD(tupAD, strOutputStem)
else:
tupAD = getAD(pdQueryDistances, pdRandomDistances)
processTupAD(tupAD, strOutputStem)
if __name__ == "__main__":
main()