/
xvg_average_residues_ratios.py
executable file
·290 lines (238 loc) · 10.5 KB
/
xvg_average_residues_ratios.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#generic python modules
import argparse
import operator
from operator import itemgetter
import sys, os, shutil
import os.path
##########################################################################################
# RETRIEVE USER INPUTS
##########################################################################################
#=========================================================================================
# create parser
#=========================================================================================
# Version string: shown in the help banner, by --version, and written by
# write_xvg() in the header of the output file.
version_nb = "0.0.1"
# add_help = False because -h/--help is registered by hand further down.
# RawDescriptionHelpFormatter keeps the description below laid out as typed.
# Every option is declared with help=argparse.SUPPRESS: the option table is
# maintained by hand inside the description text instead.
parser = argparse.ArgumentParser(prog = 'xvg_average_residues_ratios', usage='', add_help = False, formatter_class = argparse.RawDescriptionHelpFormatter, description =\
'''
***************************************************
v''' + version_nb + '''
author: Jean Helie (jean.helie@bioch.ox.ac.uk)
git: https://github.com/jhelie/xvg_average_residues
***************************************************
[ DESCRIPTION ]
This script calculate the average of residues ratios from residues densities contained
in several xvg files.
It calculates the avg and (unbiased) std dev and can deal with NaN.
The ratios are calculated for each supplied file, then averaged proportional to the weight
of that file.
NB:
the script may give out a warning 'return np.mean(x,axis)/factor', it's ok. it's just
scipy warning us that there were only nans on a row, the result will be a nan as we
expect (see this thread: https://github.com/scipy/scipy/issues/2898).
[ USAGE ]
Option Default Description
-----------------------------------------------------
-f : xvg file(s)
-o residues_avg : name of outptut file
--comments @,# : lines starting with these characters will be considered as comment
Other options
-----------------------------------------------------
--version : show version number and exit
-h, --help : show this menu and exit
''')
#options
# -f takes one or more paths (nargs='+') and is mandatory.
# -o and --comments use nargs=1, so argparse stores them as one-element lists
# (their defaults are written as lists for the same reason).
parser.add_argument('-f', nargs='+', dest='xvgfilenames', help=argparse.SUPPRESS, required=True)
parser.add_argument('-o', nargs=1, dest='output_file', default=["residues_avg"], help=argparse.SUPPRESS)
parser.add_argument('--comments', nargs=1, dest='comments', default=['@,#'], help=argparse.SUPPRESS)
#other options
parser.add_argument('--version', action='version', version='%(prog)s v' + version_nb, help=argparse.SUPPRESS)
parser.add_argument('-h','--help', action='help', help=argparse.SUPPRESS)
#=========================================================================================
# store inputs
#=========================================================================================
args = parser.parse_args()
# Unwrap the one-element lists produced by nargs=1.
args.output_file = args.output_file[0]
# The comment markers come as one comma-separated string: turn it into a list.
args.comments = args.comments[0].split(',')
#=========================================================================================
# import modules (doing it now otherwise might crash before we can display the help menu!)
#=========================================================================================
#generic science modules
# The scientific modules are imported only after the command line has been
# parsed, so that the help menu can still be displayed when they are missing.
# Only ImportError is caught: a bare "except" would also swallow Ctrl-C,
# SystemExit and genuine errors raised while the modules initialise.
try:
    import numpy as np
except ImportError:
    print("Error: you need to install the numpy module.")
    sys.exit(1)
try:
    import scipy
    import scipy.stats
except ImportError:
    print("Error: you need to install the scipy module.")
    sys.exit(1)
#=======================================================================
# sanity check
#=======================================================================
# Averaging needs more than one input: a lone file is rejected outright.
if len(args.xvgfilenames) == 1:
    print("Error: only 1 data file specified.")
    sys.exit(1)

# Stop at the first input path that is not an existing regular file.
for f in args.xvgfilenames:
    if os.path.isfile(f):
        continue
    print("Error: file " + str(f) + " not found.")
    sys.exit(1)
##########################################################################################
# FUNCTIONS DEFINITIONS
##########################################################################################
#=========================================================================================
# data loading
#=========================================================================================
def load_xvg(): #DONE
    """Read every input xvg file and compute the per-file residue ratios.

    Reads the module-level ``args`` (``args.xvgfilenames`` and
    ``args.comments``) and sets the following module-level variables:

      nb_rows     -- number of data rows (identical in every file)
      nb_cols     -- number of data columns (identical in every file)
      weights     -- 1D array, one weight per file (1 unless the file says otherwise)
      distance    -- (nb_rows, 1) array holding the first column of the files
      data_ratios -- (nb_rows, nb_files) array of ratios, NaN where not sampled

    Lines starting with one of the comment markers are not data. A comment
    line containing '-> weight = X' sets the weight of its file to X.

    Columns 1, 3 and 5 of each file are stored as the "basic", "hphob" and
    "total" residue data (going by the variable names). Rows whose "total" is
    0 are considered not sampled and become NaN. The ratio stored is
        100 * hphob / (basic + hphob)
    NOTE(review): write_xvg() labels the output "basic / (basic + hydrophobic)"
    and does not mention the factor 100 -- confirm which one is intended.

    The script exits with status 1 if a weight is negative, if a file holds no
    data or fewer than 6 columns, or if the files do not share the same number
    of rows, number of columns and first column.
    """
    global nb_rows
    global nb_cols
    global weights
    global distance
    global data_ratios

    nb_files = len(args.xvgfilenames)
    nb_rows = 0
    nb_cols = 0
    weights = np.ones(nb_files)
    #an empty marker is dropped: it would flag every single line as a comment
    comment_prefixes = tuple(c for c in args.comments if c)

    for f_index, filename in enumerate(args.xvgfilenames):
        #display progress (write first and flush afterwards so that it shows up straight away)
        progress = '\r -reading file ' + str(f_index+1) + '/' + str(nb_files) + ' '
        sys.stdout.write(progress)
        sys.stdout.flush()

        #get file content
        with open(filename) as f:
            lines = f.readlines()

        #separate the comments from the data and look for a weight in the comments
        #(comments are filtered out here rather than counted and skipped from the top
        #of the file, so that a comment located between data rows is harmless)
        data_lines = []
        for line in lines:
            if not line.startswith(comment_prefixes):
                data_lines.append(line)
                continue
            if "weight" not in line:
                continue
            if "-> weight = " in line:
                weights[f_index] = float(line.split("-> weight = ")[1])
                if weights[f_index] < 0:
                    print("\nError: the weight in file " + str(filename) + " should be a positive number.")
                    print(" -> " + str(line))
                    sys.exit(1)
            else:
                print("\nWarning: keyword 'weight' found in the comments of file " + str(filename) + ", but weight not read in as the format '-> weight = ' wasn't found.")

        #get data (ndmin = 2 keeps a file with a single data row 2-dimensional)
        if not any(line.strip() for line in data_lines):
            print("\nError: file " + str(filename) + " does not contain any data.")
            sys.exit(1)
        tmp_data = np.loadtxt(data_lines, ndmin = 2)

        if f_index == 0:
            #the first file is the reference the other ones are compared to
            nb_rows, nb_cols = np.shape(tmp_data)
            if nb_cols < 6:
                #columns 1, 3 and 5 are read below
                print("\nError: file " + str(filename) + " has " + str(nb_cols) + " data columns, at least 6 are required.")
                sys.exit(1)
            distance = np.zeros((nb_rows, 1))									#distance from cluster
            data_residues_basic = np.zeros((nb_rows, nb_files))
            data_residues_hphob = np.zeros((nb_rows, nb_files))
            data_residues_total = np.zeros((nb_rows, nb_files))
            distance[:,0] = tmp_data[:,0]
        else:
            #check that each file has the same number of data rows
            if np.shape(tmp_data)[0] != nb_rows:
                print("Error: file " + str(filename) + " has " + str(np.shape(tmp_data)[0]) + " data rows, whereas file " + str(args.xvgfilenames[0]) + " has " + str(nb_rows) + " data rows.")
                sys.exit(1)
            #check that each file has the same number of columns
            if np.shape(tmp_data)[1] != nb_cols:
                print("Error: file " + str(filename) + " has " + str(np.shape(tmp_data)[1]) + " data columns, whereas file " + str(args.xvgfilenames[0]) + " has " + str(nb_cols) + " data columns.")
                sys.exit(1)
            #check that each file has the same first column
            if not np.array_equal(tmp_data[:,0],distance[:,0]):
                print("\nError: the first column of file " + str(filename) + " is different than that of " + str(args.xvgfilenames[0]) + ".")
                sys.exit(1)

        #store data
        data_residues_basic[:,f_index] = tmp_data[:,1]
        data_residues_hphob[:,f_index] = tmp_data[:,3]
        data_residues_total[:,f_index] = tmp_data[:,5]

    #remove non meaningful zeros (i.e. not sampled)
    data_residues_total[data_residues_total == 0] = np.nan
    not_sampled = np.isnan(data_residues_total)
    data_residues_basic[not_sampled] = np.nan
    data_residues_hphob[not_sampled] = np.nan

    #calculate ratios
    tmp_div = data_residues_basic + data_residues_hphob
    tmp_div[np.isnan(tmp_div)] = 1
    #a sampled row where basic + hphob is 0 gives 0/0 = NaN, which calculate_avg()
    #ignores like any other NaN: no need for numpy to warn about it
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        data_ratios = data_residues_hphob / tmp_div * 100

    return
#=========================================================================================
# core functions
#=========================================================================================
def calculate_avg(): #DONE
    """Compute the weighted average and std dev of the ratios across files.

    Reads the module-level ``weights`` (1D, one weight per file) and
    ``data_ratios`` (rows x files, NaN where a file has no value for a row),
    and stores the results in the module-level ``data_ratios_avg`` and
    ``data_ratios_std``, both of shape (rows, 1).

    For each row, only the files holding a non-NaN ratio contribute:
        avg = sum(w_i * x_i) / sum(w_i)
        std = sqrt( sum(w_i) / (sum(w_i)**2 - sum(w_i**2)) * sum(w_i * (x_i - avg)**2) )
    Whenever a denominator is 0 (e.g. a single contributing file) it is
    replaced by 1, which yields a std dev of 0. Rows for which no file holds a
    value get NaN for both the average and the std dev.

    The average is computed as an explicit weighted sum rather than with
    scipy.stats.nanmean, which is no longer available in current SciPy.
    """
    global data_ratios_avg
    global data_ratios_std

    #a file holding a NaN on a given row must not contribute to that row
    is_missing = np.isnan(data_ratios)
    row_weights = np.where(is_missing, 0.0, weights)
    no_data = np.sum(~is_missing, axis = 1) == 0
    weights_nan = np.nansum(row_weights, axis = 1)[:, np.newaxis]
    weights_nan_sq = np.nansum(row_weights**2, axis = 1)[:, np.newaxis]
    weights_nan[weights_nan == 0] = 1

    #avg (nansum skips the NaN ratios, i.e. the files which do not contribute)
    data_ratios_avg = np.nansum(weights * data_ratios, axis = 1)[:, np.newaxis] / weights_nan
    #depending on the numpy version nansum gives 0 or NaN for a row made of NaN only: make it NaN
    data_ratios_avg[no_data, 0] = np.nan

    #std
    tmp_std = np.nansum(weights * (data_ratios - data_ratios_avg)**2, axis = 1)[:, np.newaxis]
    tmp_div = weights_nan**2 - weights_nan_sq
    tmp_div[tmp_div == 0] = 1
    data_ratios_std = np.sqrt(weights_nan / tmp_div * tmp_std)
    #no data means no std dev either (rather than a misleading 0)
    data_ratios_std[no_data, 0] = np.nan

    return
#=========================================================================================
# outputs
#=========================================================================================
def write_xvg(): #DONE
    """Write the averaged ratios to '<args.output_file>.xvg'.

    A relative output name is created in the current working directory.
    Reads the module-level ``args``, ``version_nb``, ``weights``, ``nb_rows``,
    ``distance``, ``data_ratios_avg`` and ``data_ratios_std``.

    The file starts with '#' comment lines (script version, input files and,
    when the sum of the weights exceeds the number of files, that sum as
    '-> weight = X'), followed by the '@' xvg metadata and one line per row:
    distance, average, std dev.

    NOTE(review): the y axis label and the legends read "basic / (basic +
    hydrophobic)" whereas load_xvg() computes 100 * hphob / (basic + hphob)
    -- confirm which one is intended.
    """
    filename_xvg = os.path.join(os.getcwd(), str(args.output_file) + '.xvg')

    #general header
    header = [
        "# [average xvg - written by xvg_average_residues_ratios v" + str(version_nb) + "]\n",
        "# - files: " + ",".join(str(f) for f in args.xvgfilenames) + "\n",
    ]
    if np.sum(weights) > len(args.xvgfilenames):
        header.append("# -> weight = " + str(np.sum(weights)) + "\n")

    #xvg metadata
    metadata = [
        "@ title \"Average xvg\"\n",
        "@ xaxis label \"distance from cluster z axis (Angstrom)\"\n",
        "@ yaxis label \"basic / (basic + hydrophobic) in protein residues\"\n",
        "@ autoscale ONREAD xaxes\n",
        "@ TYPE XY\n",
        "@ view 0.15, 0.15, 0.95, 0.85\n",
        "@ legend on\n",
        "@ legend box on\n",
        "@ legend loctype view\n",
        "@ legend 0.98, 0.8\n",
        "@ legend length 2\n",
        "@ s0 legend \"b/(b+h) (avg)\"\n",
        "@ s1 legend \"b/(b+h) (std)\"\n",
    ]

    #the with statement closes the file even if one of the writes fails
    with open(filename_xvg, 'w') as output_xvg:
        output_xvg.writelines(header)
        output_xvg.writelines(metadata)
        #data
        for r in range(0, nb_rows):
            results = str(distance[r,0])
            results += " " + "{:.6e}".format(data_ratios_avg[r,0]) + " " + "{:.6e}".format(data_ratios_std[r,0])
            output_xvg.write(results + "\n")

    return
##########################################################################################
# MAIN
##########################################################################################
# Step 1: read the input files and build the per-file ratios.
print("\nReading files...")
load_xvg()

# Step 2: average the ratios across the files, then write the result.
print("\n\nWriting average file...")
calculate_avg()
write_xvg()

#=========================================================================================
# exit
#=========================================================================================
print("\nFinished successfully! Check result in file '" + args.output_file + ".xvg'.")
print("")
sys.exit(0)