/
de_id.py
330 lines (246 loc) · 16.6 KB
/
de_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
import dicom
import pandas as pd
import re
import fnmatch
import os
'''
Peter Goodin, Dec 2016
'''
#Get spreadsheet type from file extension of main_sheet.
def sabr_subj_ss_check(subj_ss):
    """
    Read the subject information spreadsheet and return it as a DataFrame.

    Parameters
    ----------
    subj_ss : str
        Path to the subject spreadsheet (.csv, .tsv, .xls or .xlsx).
        The first column must be labelled 'subj_name'.

    Returns
    -------
    pandas.DataFrame
        The parsed subject sheet (original code raised on bad input but
        returned None on success due to bare `return` statements; fixed to
        actually return the DataFrame).

    Raises
    ------
    Exception
        If the file cannot be read, the extension is unsupported, or the
        first column is not named 'subj_name'.
    """
    if subj_ss.endswith('.csv') or subj_ss.endswith('.tsv'):
        bad_columns = False
        #Try comma first, then fall back to tab. A tab-separated file read
        #with sep=',' parses into a single mislabelled column, which fails
        #the column check and triggers the tab retry.
        for sep in (',', '\t'):
            try:
                subj_df = pd.read_csv(subj_ss, sep = sep)
            except Exception:
                continue
            if subj_df.columns[0] == 'subj_name':
                return subj_df
            bad_columns = True
        #Distinguish "parsed but wrong columns" from "could not read at all"
        #(the original swallowed the column error inside a bare except).
        if bad_columns:
            raise Exception('\n***ERROR! NON STANDARD OR MISSING COLUMN NAMES!***\nPlease ensure columns are labelled sub_name and sub_id.\n')
        raise Exception('\n***ERROR! UNABLE TO READ FILE!***\nPlease check your file path is correct or ensure your spreadsheet uses either commas (,) or tabbed spaces to separate cells.\n')
    elif subj_ss.endswith('.xls') or subj_ss.endswith('.xlsx'):
        #Original compared subj_ss[-3:] == 'xls', which never matched .xlsx.
        try:
            subj_df = pd.read_excel(subj_ss)
        except Exception:
            raise Exception('\n***ERROR! UNABLE TO READ FILE!***\nPlease check the file path is correct or ensure your spreadsheet is an .xlsx file.\n')
        if subj_df.columns[0] != 'subj_name':
            raise Exception('\n***ERROR! NON STANDARD OR MISSING COLUMN NAMES!***\nPlease ensure columns are labelled sub_name and sub_id.***\n')
        return subj_df
    else:
        raise Exception('\n***ERROR! UNABLE TO READ SUBJECT INFORMATION!***\nPlease check the file path is correct or use either comma / tabbed separated or excel formats.***\n')
def sabr_scan_ss_check(scan_ss):
    """
    Read the scan-matching spreadsheet and return it as a DataFrame.

    Parameters
    ----------
    scan_ss : str
        Path to the scan spreadsheet (.csv, .tsv, .xls or .xlsx).
        The first column must be labelled 'scan_match'; expected columns
        are scan_match, scan_type and scan_filename.

    Returns
    -------
    pandas.DataFrame
        The parsed scan sheet (original code returned None on success due
        to bare `return` statements; fixed to return the DataFrame).

    Raises
    ------
    Exception
        If the file cannot be read, the extension is unsupported, or the
        first column is not named 'scan_match'.
    """
    if scan_ss.endswith('.csv') or scan_ss.endswith('.tsv'):
        bad_columns = False
        #Try comma first, then tab; a wrong separator fails the column
        #check and triggers the next attempt.
        for sep in (',', '\t'):
            try:
                scan_df = pd.read_csv(scan_ss, sep = sep)
            except Exception:
                continue
            if scan_df.columns[0] == 'scan_match':
                return scan_df
            bad_columns = True
        #Report the column error distinctly instead of swallowing it
        #under a generic "unable to read" message.
        if bad_columns:
            raise Exception('\n***ERROR! NON STANDARD OR MISSING COLUMN NAMES!***\nPlease ensure columns are labelled scan_match, scan_type and scan_filename.')
        raise Exception('\n***ERROR! UNABLE TO READ FILE!***\nPlease check your file path is correct or ensure your spreadsheet uses either commas (,) or tabbed spaces to separate cells.\n')
    elif scan_ss.endswith('.xls') or scan_ss.endswith('.xlsx'):
        #Original compared scan_ss[-3:] == 'xls', which never matched .xlsx.
        try:
            scan_df = pd.read_excel(scan_ss)
        except Exception:
            raise Exception('\n***ERROR! UNABLE TO READ FILE!***\nPlease check the file path is correct or ensure your spreadsheet is an .xlsx file.\n')
        if scan_df.columns[0] != 'scan_match':
            raise Exception('\n***ERROR! NON STANDARD OR MISSING COLUMN NAMES!***\nPlease ensure columns are labelled scan_match, scan_type and scan_filename.')
        return scan_df
    else:
        raise Exception('\n***ERROR! UNABLE TO READ SUBJECT INFORMATION!***\nPlease use either comma / tabbed separated or excel formats.***\n')
def _sabr_scrub_dicom(dcm, new_id, scan_filename):
    """
    Strip / overwrite identifying header tags of a single DICOM dataset in place.

    Parameters
    ----------
    dcm : dicom dataset (legacy pydicom `dicom.read_file` object)
    new_id : str
        De-identified subject ID written into PatientName / PatientID.
    scan_filename : str
        Sequence label, used only in the missing-tag warning messages.

    Returns
    -------
    The same (mutated) dataset.
    """
    #(group, element) tags blanked to ''.
    blank_tags = [
        #Acquisition date information
        [0x0008, 0x0020], [0x0008, 0x0021], [0x0008, 0x0022], [0x0008, 0x0023],
        #Acquisition time information
        [0x0008, 0x0030], [0x0008, 0x0031], [0x0008, 0x0032], [0x0008, 0x0033],
        #Physician information
        [0x0008, 0x0090], [0x0008, 0x1050],
        #Subject attributes (DoB, age, height, weight -- per original notes)
        [0x0010, 0x0030], [0x0010, 0x1010], [0x0010, 0x1020], [0x0010, 0x1030],
    ]
    #NOTE(review): hex() stringifies the tag pair before indexing; kept
    #byte-identical to the original behaviour for the legacy dicom package.
    for tag in blank_tags:
        try:
            dcm[hex(tag[0]), hex(tag[1])].value = ''
        except Exception:
            print('Tag {} {} does not exist in {}. Moving to next tag'.format(hex(tag[0]), hex(tag[1]), scan_filename))
    #PatientName / PatientID are replaced with the de-identified ID rather than blanked.
    for tag in [[0x0010, 0x0010], [0x0010, 0x0020]]:
        try:
            dcm[hex(tag[0]), hex(tag[1])].value = new_id
        except Exception:
            print('Tag {} {} does not exist in {}. Moving to next tag'.format(hex(tag[0]), hex(tag[1]), scan_filename))
    return dcm


def _sabr_deid_session(subj_session_dir, subj_deid_session_dir, scan_df, new_id):
    """
    De-identify one session: for each scan type in scan_df, find matching
    sequence folders under subj_session_dir, scrub every DICOM and write it
    under subj_deid_session_dir/<scan_type>/<scan_filename>/.
    """
    for j, scan_type in enumerate(scan_df['scan_type']):
        subj_deid_meta_dir = os.path.join(subj_deid_session_dir, scan_type)
        try:
            os.mkdir(subj_deid_meta_dir)
        except OSError:
            print('Meta directory {} exists.'.format(scan_type))
        #Match common sequence substring (glob pattern) with path in os.walk
        match_regex = fnmatch.translate(scan_df.scan_match[j])
        for root, dr, files in os.walk(subj_session_dir):
            #If no match, move onto next folder.
            if re.search(match_regex, root) is None:
                continue
            subj_deid_sequence_dir = os.path.join(subj_deid_meta_dir, scan_df.scan_filename[j])
            print('Making directory {}'.format(subj_deid_sequence_dir))
            try:
                #"Housing" directory keeps dicoms of different sequences but same meta-category separate.
                os.mkdir(subj_deid_sequence_dir)
            except OSError:
                print('\n***SEQUENCE DIRECTORY ALREADY EXISTS!***\nSkipping.')
                continue
            #List the sequence dir ourselves (rather than trusting `files`)
            #and drop any previously converted nii files.
            #TODO: expand to other file types (mgh, analyze, etc)
            anon_files = [x for x in os.listdir(root) if 'nii' not in x]
            anon_files.sort()
            for anon_file in anon_files:
                #force=True in case dicoms haven't had identifier added to header
                dcm = dicom.read_file(os.path.join(root, anon_file), force = True)
                dcm = _sabr_scrub_dicom(dcm, new_id, scan_df.scan_filename[j])
                #Write anonymised file
                dicom.write_file(os.path.join(subj_deid_sequence_dir, anon_file), dcm)


def sabr_deid(subj_info, scan_df, raw_dir, deid_outdir):
    """
    De-identify all DICOM sessions for one subject.

    Parameters
    ----------
    subj_info : mapping with 'subj_name' (raw folder name) and 'subj_id'
        (de-identified ID), e.g. one row of the subject sheet.
    scan_df : pandas.DataFrame with scan_match, scan_type, scan_filename.
    raw_dir : str
        Root of identifiable data; assumes ./raw_dir/subj_name/<session>/...
    deid_outdir : str
        Output root; sessions are written as <subj_id>/ses-01, ses-02, ...

    Returns
    -------
    (subj_deid_main_dir, subj_sessions, new_id)

    Raises
    ------
    Exception if the subject has no session sub-directories.
    """
    subj_main_dir = os.path.join(raw_dir, subj_info['subj_name'])
    new_id = subj_info['subj_id']
    #Session dirs are the immediate sub-directories of the subject folder.
    #next(...) replaces the Python-2-only .next() call.
    subj_sessions = next(os.walk(subj_main_dir))[1]
    subj_sessions.sort()
    print('\n***{} has {} session(s)'.format(new_id, len(subj_sessions)))
    #Create deidentified main (root) directory for subject
    subj_deid_main_dir = os.path.join(deid_outdir, new_id)
    try:
        os.mkdir(subj_deid_main_dir)
    except OSError:
        print('\nDirectory {} exists\n'.format(subj_deid_main_dir))
    if len(subj_sessions) == 0:
        raise Exception('\n***ERROR! NUMBER OF SESSIONS = 0!***\nPlease check directory structure of {}'.format(subj_info['subj_name']))
    #One loop covers both the single- and multi-session cases: the original
    #duplicated ~80 lines, with 'ses-01' for one session and the identical
    #zero-padded 'ses-{:02d}' for several.
    for sn, session in enumerate(subj_sessions):
        subj_deid_session_dir = os.path.join(subj_deid_main_dir, 'ses-' '{:02d}'.format(sn + 1))
        try:
            os.mkdir(subj_deid_session_dir)
        except OSError:
            print('\nSession folder {} exists\n'.format(subj_deid_session_dir))
        #Session folder for identifiable subject
        subj_session_dir = os.path.join(subj_main_dir, session)
        _sabr_deid_session(subj_session_dir, subj_deid_session_dir, scan_df, new_id)
    return(subj_deid_main_dir, subj_sessions, new_id)