/
comps_2_sim_data.py
329 lines (237 loc) · 14.1 KB
/
comps_2_sim_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import ast
import json
import os
import string
import sys

from dtk.utils.core.DTKSetupParser import DTKSetupParser
from dtk.utils.parsers.json2dict import json2dict
from dtk.utils.simulation.COMPSJavaInterop import Experiment, QueryCriteria, Client, Configuration, Priority
from dtk.utils.simulation.OutputParser import CompsDTKOutputParser #as parser

from kariba_settings import campaign_days, reports_channels, channels, objectives_channel_codes, calibration_data_file
from kariba_utils import get_sim_key, get_sim_group_key, sim_meta_2_itn_level, sim_meta_2_drug_cov, sim_meta_2_temp_h, sim_meta_2_const_h
from utils import warn_p, debug_p
def comps_login():
    """Log in to the COMPS client.

    The endpoint is whatever the DTK setup parser returns for
    ('HPC', 'server_endpoint').
    """
    endpoint = DTKSetupParser().get('HPC', 'server_endpoint')
    Client.Login(endpoint)
# merges a set of sweep calibration files in different base_dirs in a single base_dir
def combine_sweep_results(base_dirs, output_dir):
    """Merge the calibration and tags files of several sweeps into one place.

    For every directory in ``base_dirs`` the calibration JSON
    (``calibration_data_file``) and the tags report JSON
    (``tags_report_data_file``) are read and folded into combined
    structures. Both combined files are then written, under the same file
    names, into ``output_dir`` (created when it does not exist).

    Args:
        base_dirs: iterable of directory paths, each holding both files.
        output_dir: directory that receives the merged files.

    Returns:
        None. The results are written to disk.
    """
    # NOTE(review): tags_report_data_file is not among the names imported at
    # the top of this file; it presumably belongs in kariba_settings next to
    # calibration_data_file -- confirm and import it, otherwise this function
    # raises NameError.
    calib_output = {}
    # param -> set of every value seen so far; turned into lists on write
    tags_values = {}

    for base_dir in base_dirs:
        print("Processing calibration in " + base_dir)

        # get this base_dir's calibration results
        with open(os.path.join(base_dir, calibration_data_file), 'r') as calib_f:
            calib = json.load(calib_f)

        # Merge each sim group's sims. setdefault creates the group on first
        # sight; indexing calib_output directly raised KeyError here.
        for sim_group, sim_keys in calib.items():
            calib_output.setdefault(sim_group, {}).update(sim_keys)

        # update tags information
        with open(os.path.join(base_dir, tags_report_data_file), 'r') as tags_report_f:
            tags_report = json.load(tags_report_f)

        for param, values in tags_report.items():
            tags_values.setdefault(param, set()).update(values)

        print("DONE")
        print("")

    # JSON cannot serialise sets, so hand json.dump plain lists
    tags_report_comb = {}
    for param, values in tags_values.items():
        tags_report_comb[param] = list(values)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    calib_out_path = os.path.join(output_dir, calibration_data_file)
    print("Writing calibration output to " + calib_out_path)
    with open(calib_out_path, 'w') as comb_calib_f:
        json.dump(calib_output, comb_calib_f)
    print("DONE")

    tags_out_path = os.path.join(output_dir, tags_report_data_file)
    print("Writing tags to " + tags_out_path)
    with open(tags_out_path, 'w') as tags_report_f:
        json.dump(tags_report_comb, tags_report_f)
    print("DONE")
# get the specified channels/ reports output from a specified experiment sweep
def get_sweep_results(sim_meta_config_files, calib_file_path, tags_data_file_path):
    """Collect channel/report output for every finished sim of the given sweeps.

    Each file in ``sim_meta_config_files`` is a JSON experiment description
    from which the 'exp_id' and 'sims' entries are read. For every sim whose
    'output/InsetChart.json' can be loaded, the prevalence channel, the
    configured report data and the sim tags are stored under
    ``calib_output[sim_group_key][sim_key]``. The collected results are
    written as JSON to ``calib_file_path``, and the distinct tag values seen
    across all sims are written to ``tags_data_file_path``.

    Args:
        sim_meta_config_files: iterable of paths to experiment metadata JSON
            files.
        calib_file_path: path of the JSON file receiving the sim results.
        tags_data_file_path: path of the JSON file receiving the tag values.

    Returns:
        The file object used to write ``calib_file_path``. It is already
        closed when returned; this is kept only so existing callers see the
        same return value as before.
    """
    # Login to COMPs
    comps_login()

    # simulations tag data structure: accumulates sims meta information from sims tags
    tag_data = {
        'ITN trajectory': [],
        'Drug coverage per round': [],
        'Temporary habitat scale': [],
        'Constant habitat scale': [],
    }
    calib_output = {}

    # iterate through experiments
    for sim_meta_config_file in sim_meta_config_files:
        with open(sim_meta_config_file) as metadata_file:
            metadata = json.load(metadata_file)

        sim_dir_map = CompsDTKOutputParser.createSimDirectoryMap(metadata['exp_id'])

        for sim_id in metadata['sims']:
            sim_dir = sim_dir_map[sim_id]

            # json2dict returns None if the path points to a non-existing
            # file, which is the case if the sim has not successfully
            # finished; only successfully completed sims are collected.
            sim_output = json2dict(os.path.join(sim_dir, 'output', 'InsetChart.json'))
            if sim_output is None:
                continue

            # Keep only the configured channels. A filtered copy avoids
            # deleting from the dict while iterating over it.
            kept_channels = {}
            for channel, channel_data in sim_output['Channels'].items():
                if channel in channels:
                    kept_channels[channel] = channel_data

            # process specified reports
            report_channels_data = {}
            if reports_channels is not None:
                report_channels_data = process_reports(reports_channels, sim_dir_map, sim_id)

            # Record sim meta information including sim tags. The tags file
            # is parsed as a Python literal, exactly as before.
            with open(os.path.join(sim_dir, 'tags.json'), 'r') as sim_tags_f:
                sim_meta = ast.literal_eval(sim_tags_f.read())
            append_tag_data(sim_meta, tag_data)

            # construct sim group key and sim key
            x_temp_h = sim_meta_2_temp_h(sim_meta)
            const_h = sim_meta_2_const_h(sim_meta)
            itn_level = sim_meta_2_itn_level(sim_meta)
            drug_coverage_level = sim_meta_2_drug_cov(sim_meta)
            sim_key = get_sim_key(x_temp_h, const_h, itn_level, drug_coverage_level)
            sim_group_key = get_sim_group_key(itn_level, drug_coverage_level)

            # store sim channels data
            # can add/remove data entries depending on needs
            calib_output.setdefault(sim_group_key, {})[sim_key] = {
                'prevalence': kept_channels['New Diagnostic Prevalence']['Data'],
                # None when reports are disabled or lack 'reinfections';
                # a plain lookup raised KeyError in that case
                'reinfections': report_channels_data.get('reinfections'),
                'meta': sim_meta,
                'sim_id': sim_id,
            }

    print("")
    print("Writing files...")

    with open(calib_file_path, 'w') as calib_f:
        json.dump(calib_output, calib_f)
    # calib_output is keyed by sim group, so count the sims inside the groups
    num_saved = sum(len(group) for group in calib_output.values())
    print(str(num_saved) + ' simulation results saved to ' + calib_file_path)

    with open(tags_data_file_path, 'w') as tags_f:
        json.dump(tag_data, tags_f)
    print('Meta data tags saved to ' + tags_data_file_path)
    print("")

    return calib_f
# process specified reports
def process_reports(reports, sim_dir_map, sim_id):
    """Run the known report processors for one simulation.

    Only the 'reinfections' report is recognised; every other entry of
    ``reports`` is ignored.

    Returns:
        dict mapping each processed report name to its extracted data
        (empty when no recognised report was requested).
    """
    collected = {}
    for report_name in reports:
        if report_name != 'reinfections':
            continue
        collected['reinfections'] = process_reinfections_report(sim_dir_map, sim_id)
    return collected
# extract re-infections data from patient drug survey reports
def process_reinfections_report(sim_dir_map, sim_id):
    """Estimate reinfection fractions between consecutive campaign rounds.

    For every day in ``campaign_days`` the survey reports taken 5 days before
    and 5 days after that day are loaded from the sim's 'output' directory.
    For each patient present in both, the treatment recorded in the "after"
    survey (minus the treatment already recorded in the "prior" survey) is
    stored. From the second round on, a patient with a stored treatment in
    both the previous and the current round counts as tested, and as
    reinfected when 'Artemether' appears in both treatments.

    Args:
        sim_dir_map: mapping from sim id to that sim's directory.
        sim_id: id of the simulation to process.

    Returns:
        dict mapping 'round_<i>_<i+1>' to the fraction of tested patients
        that were reinfected, or to the string 'nan' when no patient was
        tested for that pair of rounds.
    """
    # survey day -> patient id -> {'initial_age': ..., 'treatment': ...}
    survey_report_output = {}
    reinfections = {}

    for i, day in enumerate(campaign_days):
        # get reports before and after campaign
        survey_day_prior = day - 5
        survey_day_after = day + 5
        current_round = survey_report_output[survey_day_after] = {}

        # re-index reports data by patient id, which is a bit more sensible
        # than the reporter's flat patient array
        prior_by_id = _index_patients_by_id(
            json2dict(_survey_report_path(sim_dir_map[sim_id], survey_day_prior)))
        after_by_id = _index_patients_by_id(
            json2dict(_survey_report_path(sim_dir_map[sim_id], survey_day_after)))

        # the first round has no earlier round to compare against
        previous_round = None
        if i > 0:
            previous_round = survey_report_output[campaign_days[i - 1] + 5]

        total_tested_patients = 0
        num_reinfected = 0
        for patient_id, patient in prior_by_id.items():
            if patient_id not in after_by_id:
                continue

            treatment_after = _strip_prior_treatment(
                _last_nonempty_treatment(after_by_id[patient_id]),
                _last_nonempty_treatment(patient))
            current_round[patient_id] = {
                'initial_age': patient['initial_age'],
                'treatment': treatment_after,
            }

            if previous_round is None or patient_id not in previous_round:
                continue

            treatment_1 = previous_round[patient_id]['treatment']
            treatment_2 = treatment_after
            if treatment_1 != '' and treatment_2 != '':
                total_tested_patients = total_tested_patients + 1
                # DO NOT APPLY FOR MDA CAMPAIGNS WITHOUT MODIFICATION!!!!!!
                # For MDA the test would need to be along the lines of:
                # ('DHA' and 'Vehicle' both present) or 'Artemether' present,
                # in the treatment of both surveys.
                if 'Artemether' in treatment_1 and 'Artemether' in treatment_2:
                    num_reinfected = num_reinfected + 1

        if i > 0:
            round_key = 'round_' + str(i) + '_' + str(i + 1)
            if total_tested_patients != 0:
                reinfections[round_key] = num_reinfected / (total_tested_patients + 0.0)
            else:
                reinfections[round_key] = 'nan'

    return reinfections


def _survey_report_path(sim_dir, survey_day):
    """Return the path of the survey report written for ``survey_day``."""
    report_name = 'MalariaSurveyJSONAnalyzer_Day_' + str(survey_day) + '_0.json'
    return os.path.join(sim_dir, 'output', report_name)


def _index_patients_by_id(survey_report):
    """Map patient id to patient record; a missing report yields no patients.

    json2dict is expected to return None for a file that does not exist
    (TODO confirm); that case is treated as an empty survey instead of
    crashing on the None.
    """
    if survey_report is None:
        return {}
    patients_by_id = {}
    for patient in survey_report['patient_array']:
        patients_by_id[patient['id']] = patient
    return patients_by_id


def _last_nonempty_treatment(patient):
    """Return the last non-empty entry of patient['treatment'][0], or ''."""
    treatment = ""
    for drugs in patient['treatment'][0]:
        if drugs != "":
            treatment = drugs
    return treatment


def _strip_prior_treatment(treatment_after, treatment_prior):
    """Remove the first occurrence of the prior treatment from the later one.

    When the prior treatment appears as ' + <prior>', the separator is
    removed together with it.
    """
    if treatment_prior == '':
        # nothing was recorded before, so there is nothing to strip
        return treatment_after
    joined = ' + ' + treatment_prior
    if joined in treatment_after:
        return treatment_after.replace(joined, '', 1)
    return treatment_after.replace(treatment_prior, '', 1)
def append_tag_data(sim_meta, tag_data):
    """Record the tag values of one sim in ``tag_data``, skipping duplicates.

    Four values are extracted from ``sim_meta`` (three of them from tags
    holding Python-literal strings) and each is appended to the matching
    list in ``tag_data`` unless that list already contains it.

    Args:
        sim_meta: dict of sim tags; reads 'add_ITN_mult',
            'add_drug_multi_campaigns', 'x_Temporary_Larval_Habitat' and
            'scale_larval_habitats_single'.
        tag_data: dict of lists keyed by 'ITN trajectory',
            'Drug coverage per round', 'Temporary habitat scale' and
            'Constant habitat scale'; modified in place.
    """
    itn_spec = ast.literal_eval(sim_meta['add_ITN_mult'])
    itn_value = itn_spec[0][1][0][0][1]

    drug_spec = ast.literal_eval(sim_meta['add_drug_multi_campaigns'])
    drug_value = drug_spec[0][1][0]['coverage']

    temp_scale = float(sim_meta['x_Temporary_Larval_Habitat'])

    const_spec = ast.literal_eval(sim_meta['scale_larval_habitats_single'])
    const_scale = float(const_spec[0][1][1])

    observed = (
        ('ITN trajectory', itn_value),
        ('Drug coverage per round', drug_value),
        ('Temporary habitat scale', temp_scale),
        ('Constant habitat scale', const_scale),
    )
    for tag_name, value in observed:
        seen_values = tag_data[tag_name]
        if value not in seen_values:
            seen_values.append(value)