def load_clean_metadata(f):
    """Load the metadata pickle for slice file *f* and return a cleaned dict.

    Only the first frame's metadata record is used; the record is cleaned via
    utils.clean_metadata and then enriched in place with the patient id and
    the slice file name before being returned.
    """
    raw_record = disk_access.load_metadata_from_file(f)[0]
    cleaned = utils.clean_metadata(raw_record)
    _enhance_metadata(cleaned, _extract_id_from_path(f), os.path.basename(f))
    return cleaned
def get_patient_data(indices, wanted_input_tags, wanted_output_tags,
                     set="train", preprocess_function=None, testaug=False):
    """Load one chunk of patient data matched to the requested tags.

    :param indices: indices into the dataset's patient-folder list;
        out-of-range indices are silently skipped.
    :param wanted_input_tags: tags naming the image/metadata inputs to load
        (e.g. "sliced:data:singleslice", "sliced:data:sax", "sliced:meta:<key>").
    :param wanted_output_tags: tags naming the label arrays to build
        (e.g. "systole", "diastole:onehot", "patients").
    :param set: dataset split to read ("train", "validation" or "test").
    :param preprocess_function: callable that writes the loaded images into
        the chunk arrays and returns
        (label_correction_function, classification_correction_function).
    :param testaug: whether to use test-time augmentation; forwarded to
        preprocess_function.
    :return: dict with "input" and "output" sub-dicts of numpy arrays.
    :raises ValueError: if `set` is not a known dataset.
    :raises Exception: for an unlabelled patient outside the test set, or
        (with check_inputs enabled) for empty / non-finite chunk entries.
    """

    def initialise_empty():
        """Allocate one zero-filled chunk for the requested in/out tags."""
        result = {
            "input": {},
            "output": {},
        }
        no_samples = _config().batch_size * _config().batches_per_chunk
        vector_size = (no_samples, )
        matrix_size = (no_samples, 600)
        OUTPUT_DATA_SIZE_TYPE = {
            "systole": (matrix_size, "float32"),
            "diastole": (matrix_size, "float32"),
            "average": (matrix_size, "float32"),
            "systole:onehot": (matrix_size, "float32"),
            "diastole:onehot": (matrix_size, "float32"),
            "systole:class_weight": (matrix_size, "float32"),
            "diastole:class_weight": (matrix_size, "float32"),
            "systole:value": (vector_size, "float32"),
            "diastole:value": (vector_size, "float32"),
            "patients": (vector_size, "int32"),
            "slices": (vector_size, "int32"),
            "area_per_pixel": (no_samples, ),
        }
        for tag in wanted_output_tags:
            if tag in OUTPUT_DATA_SIZE_TYPE:
                size, dtype = OUTPUT_DATA_SIZE_TYPE[tag]
                result["output"][tag] = np.zeros(size, dtype=dtype)
        for tag in wanted_input_tags:
            if tag in _config().data_sizes:
                chunk_shape = list(_config().data_sizes[tag])
                # first axis grows from one batch to a whole chunk
                chunk_shape[0] = chunk_shape[0] * _config().batches_per_chunk
                chunk_shape = tuple(chunk_shape)
                result["input"][tag] = np.zeros(chunk_shape, dtype="float32")
        if "classification_correction_function" in wanted_output_tags:
            # identity correction by default; replaced per sample below
            result["output"]["classification_correction_function"] = [
                lambda x: x
            ] * no_samples
        return result

    result = initialise_empty()
    if set not in patient_folders:
        raise ValueError("Don't know the dataset %s" % set)
    folders = [
        patient_folders[set][i] for i in indices if 0 <= i < num_patients[set]
    ]

    # Iterate over folders
    for i, folder in enumerate(folders):
        # find the id of the current patient in the folder name (=safer)
        id = _extract_id_from_path(folder)
        files = _in_folder(folder)
        patient_result = dict()
        metadatas_result = dict()

        # function for loading and cleaning metadata; only the first frame
        def load_clean_metadata(f):
            m = utils.clean_metadata(disk_access.load_metadata_from_file(f)[0])
            pid = _extract_id_from_path(f)
            slicename = os.path.basename(f)
            _enhance_metadata(m, pid, slicename)
            return m

        # Iterate over input tags
        for tag in wanted_input_tags:
            if tag.startswith("sliced:data:singleslice"):
                if "4ch" in tag:
                    l = [sax for sax in files if "4ch" in sax]
                elif "2ch" in tag:
                    l = [sax for sax in files if "2ch" in sax]
                else:
                    l = [sax for sax in files if "sax" in sax]
                if not l:
                    if hasattr(_config(), 'check_inputs') and _config().check_inputs:
                        print(
                            "Warning: patient %d has no images of this type" % id)
                    continue
                if "middle" in tag:
                    # Sort sax files on the integer in their name and take the
                    # middle slice. FIX: use // so the index stays an int
                    # (plain / yields a float index under Python 3).
                    l.sort(key=lambda f: int(
                        re.findall(r"\d+", os.path.basename(f))[0]))
                    f = l[len(l) // 2]
                else:
                    f = random.choice(l)
                patient_result[tag] = disk_access.load_data_from_file(f)
                metadatas_result[tag] = load_clean_metadata(f)
                slice_id = _extract_slice_id_from_path(f)
                if "difference" in tag:
                    # replace each frame by its difference with the next frame
                    for j in range(patient_result[tag].shape[0] - 1):
                        patient_result[tag][j] -= patient_result[tag][j + 1]
                    patient_result[tag] = np.delete(patient_result[tag], -1, 0)
            elif tag.startswith("sliced:data:chanzoom:4ch"):
                pass  # done by the 2ch branch below
            elif tag.startswith("sliced:data:chanzoom:2ch"):
                l_4ch = [sax for sax in files if "4ch" in sax]
                l_2ch = [sax for sax in files if "2ch" in sax]
                patient_result[tag] = [
                    disk_access.load_data_from_file(l_4ch[0]) if l_4ch else None,
                    disk_access.load_data_from_file(l_2ch[0]) if l_2ch else None
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(l_4ch[0]) if l_4ch else None,
                    load_clean_metadata(l_2ch[0]) if l_2ch else None,
                    None
                ]
                l = [sax for sax in files if "sax" in sax]
                metadatas_result[tag][2] = [load_clean_metadata(f) for f in l]
            elif tag.startswith("sliced:data:randomslices"):
                l = [sax for sax in files if "sax" in sax]
                nr_slices = result["input"][tag].shape[1]
                chosen_files = utils.pick_random(l, nr_slices)
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in chosen_files
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in chosen_files
                ]
            elif tag.startswith("sliced:data:sax:locations"):
                pass  # will be filled in by sliced:data:sax
            elif tag.startswith("sliced:data:sax:distances"):
                # FIX: two further identical (unreachable) elif branches for
                # this prefix were removed as dead code.
                pass  # will be filled in by sliced:data:sax
            elif tag.startswith("sliced:data:sax:is_not_padded"):
                pass  # will be filled in by sliced:data:sax
            elif tag.startswith("sliced:data:sax"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in files
                    if "sax" in f
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in files if "sax" in f
                ]
            elif tag.startswith("sliced:data:shape"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f).shape for f in files
                ]
                # NOTE(review): data covers all files but metadata only the
                # sax files — looks inconsistent; kept as-is, confirm intent.
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in files if "sax" in f
                ]
            elif tag.startswith("sliced:data"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in files
                ]
                metadatas_result[tag] = [load_clean_metadata(f) for f in files]
            elif tag.startswith("area_per_pixel"):
                patient_result[tag] = None  # filled in during preprocessing
            elif tag.startswith("sliced:meta:all"):
                # strip the tag prefix to get the key used in the pickle
                # (was misspelled "slided:meta:all:"; same length, so the
                # slice — and therefore behaviour — is unchanged)
                key = tag[len("sliced:meta:all:"):]
                patient_result[tag] = [
                    disk_access.load_metadata_from_file(f)[0][key]
                    for f in files
                ]
            elif tag.startswith("sliced:meta"):
                # strip the tag prefix to get the key used in the pickle
                key = tag[len("sliced:meta:"):]
                metadata_field = disk_access.load_metadata_from_file(
                    files[0])[0][key]
                patient_result[tag] = metadata_field
            # add others when needed

        # FIX: forward the caller's testaug flag; it used to be hard-coded to
        # True, which silently ignored the parameter.
        label_correction_function, classification_correction_function = preprocess_function(
            patient_result,
            result=result["input"],
            index=i,
            metadata=metadatas_result,
            testaug=testaug)
        if "classification_correction_function" in wanted_output_tags:
            result["output"]["classification_correction_function"][
                i] = classification_correction_function

        # load the labels
        if "patients" in wanted_output_tags:
            result["output"]["patients"][i] = id
        if "slices" in wanted_output_tags:
            # NOTE(review): slice_id is only bound by the singleslice branch;
            # requesting "slices" without such an input tag raises NameError.
            result["output"]["slices"][i] = slice_id

        # only read labels, when we actually have them
        if id in regular_labels[:, 0]:
            assert regular_labels[id - 1, 0] == id
            V_systole = label_correction_function(regular_labels[id - 1, 1])
            V_diastole = label_correction_function(regular_labels[id - 1, 2])
            if "systole" in wanted_output_tags:
                # cumulative target: 1.0 from the (ceiled) volume onwards
                result["output"]["systole"][i][int(np.ceil(V_systole)):] = 1.0
            if "diastole" in wanted_output_tags:
                result["output"]["diastole"][i][int(np.ceil(V_diastole)):] = 1.0
            if "average" in wanted_output_tags:
                result["output"]["average"][i][
                    int(np.ceil((V_diastole + V_systole) / 2.0)):] = 1.0
            if "systole:onehot" in wanted_output_tags:
                result["output"]["systole:onehot"][i][int(
                    np.ceil(V_systole))] = 1.0
            if "diastole:onehot" in wanted_output_tags:
                result["output"]["diastole:onehot"][i][int(
                    np.ceil(V_diastole))] = 1.0
            if "systole:value" in wanted_output_tags:
                result["output"]["systole:value"][i] = V_systole
            if "diastole:value" in wanted_output_tags:
                result["output"]["diastole:value"][i] = V_diastole
            if "systole:class_weight" in wanted_output_tags:
                result["output"]["systole:class_weight"][
                    i] = utils.linear_weighted(V_systole)
            if "diastole:class_weight" in wanted_output_tags:
                result["output"]["diastole:class_weight"][
                    i] = utils.linear_weighted(V_diastole)
        else:
            if set != "test":
                raise Exception("unknown patient in train or validation set")

    # Check if any of the inputs or outputs are still empty!
    if hasattr(_config(), 'check_inputs') and _config().check_inputs:
        for key, value in itertools.chain(result["input"].items(),
                                          result["output"].items()):
            if key == "classification_correction_function":
                continue
            if not np.any(value):  # there are only zeros in value
                raise Exception("there is an empty value at key %s" % key)
            if not np.isfinite(value).all():  # there are NaN's or infinites
                print(value)
                raise Exception("there is a NaN at key %s" % key)
    return result
def get_patient_data(indices, wanted_input_tags, wanted_output_tags,
                     set="train", preprocess_function=None, testaug=False):
    """Load one chunk of patient data matched to the requested tags.

    :param indices: indices into the dataset's patient-folder list;
        out-of-range indices are silently skipped.
    :param wanted_input_tags: tags naming the image/metadata inputs to load
        (e.g. "sliced:data:singleslice", "sliced:data:sax", "sliced:meta:<key>").
    :param wanted_output_tags: tags naming the label arrays to build
        (e.g. "systole", "diastole:onehot", "patients").
    :param set: dataset split to read ("train", "validation" or "test").
    :param preprocess_function: callable that writes the loaded images into
        the chunk arrays and returns
        (label_correction_function, classification_correction_function).
    :param testaug: whether to use test-time augmentation; forwarded to
        preprocess_function.
    :return: dict with "input" and "output" sub-dicts of numpy arrays.
    :raises ValueError: if `set` is not a known dataset.
    :raises Exception: for an unlabelled patient outside the test set, or
        (with check_inputs enabled) for empty / non-finite chunk entries.
    """

    def initialise_empty():
        """Allocate one zero-filled chunk for the requested in/out tags."""
        result = {
            "input": {},
            "output": {},
        }
        no_samples = _config().batch_size * _config().batches_per_chunk
        vector_size = (no_samples, )
        matrix_size = (no_samples, 600)
        OUTPUT_DATA_SIZE_TYPE = {
            "systole": (matrix_size, "float32"),
            "diastole": (matrix_size, "float32"),
            "average": (matrix_size, "float32"),
            "systole:onehot": (matrix_size, "float32"),
            "diastole:onehot": (matrix_size, "float32"),
            "systole:class_weight": (matrix_size, "float32"),
            "diastole:class_weight": (matrix_size, "float32"),
            "systole:value": (vector_size, "float32"),
            "diastole:value": (vector_size, "float32"),
            "patients": (vector_size, "int32"),
            "slices": (vector_size, "int32"),
            "area_per_pixel": (no_samples, ),
        }
        for tag in wanted_output_tags:
            if tag in OUTPUT_DATA_SIZE_TYPE:
                size, dtype = OUTPUT_DATA_SIZE_TYPE[tag]
                result["output"][tag] = np.zeros(size, dtype=dtype)
        for tag in wanted_input_tags:
            if tag in _config().data_sizes:
                chunk_shape = list(_config().data_sizes[tag])
                # first axis grows from one batch to a whole chunk
                chunk_shape[0] = chunk_shape[0] * _config().batches_per_chunk
                chunk_shape = tuple(chunk_shape)
                result["input"][tag] = np.zeros(chunk_shape, dtype="float32")
        if "classification_correction_function" in wanted_output_tags:
            # identity correction by default; replaced per sample below
            result["output"]["classification_correction_function"] = [
                lambda x: x
            ] * no_samples
        return result

    result = initialise_empty()
    if set not in patient_folders:
        raise ValueError("Don't know the dataset %s" % set)
    folders = [
        patient_folders[set][i] for i in indices if 0 <= i < num_patients[set]
    ]

    # Iterate over folders
    for i, folder in enumerate(folders):
        # find the id of the current patient in the folder name (=safer)
        id = _extract_id_from_path(folder)
        files = _in_folder(folder)
        patient_result = dict()
        metadatas_result = dict()

        # function for loading and cleaning metadata; only the first frame
        def load_clean_metadata(f):
            m = utils.clean_metadata(disk_access.load_metadata_from_file(f)[0])
            pid = _extract_id_from_path(f)
            slicename = os.path.basename(f)
            _enhance_metadata(m, pid, slicename)
            return m

        # Iterate over input tags
        for tag in wanted_input_tags:
            if tag.startswith("sliced:data:singleslice"):
                if "4ch" in tag:
                    l = [sax for sax in files if "4ch" in sax]
                elif "2ch" in tag:
                    l = [sax for sax in files if "2ch" in sax]
                else:
                    l = [sax for sax in files if "sax" in sax]
                if not l:
                    if hasattr(_config(), 'check_inputs') and _config().check_inputs:
                        # FIX: print function instead of Py2-only print
                        # statement, for consistency with the rest of the file
                        print(
                            "Warning: patient %d has no images of this type" % id)
                    continue
                if "middle" in tag:
                    # Sort sax files on the integer in their name and take the
                    # middle slice. FIX: // keeps integer semantics on both
                    # Python 2 and 3 (plain / is float division under Py3).
                    l.sort(key=lambda f: int(
                        re.findall(r"\d+", os.path.basename(f))[0]))
                    f = l[len(l) // 2]
                else:
                    f = random.choice(l)
                patient_result[tag] = disk_access.load_data_from_file(f)
                metadatas_result[tag] = load_clean_metadata(f)
                slice_id = _extract_slice_id_from_path(f)
                if "difference" in tag:
                    # replace each frame by its difference with the next frame
                    for j in range(patient_result[tag].shape[0] - 1):
                        patient_result[tag][j] -= patient_result[tag][j + 1]
                    patient_result[tag] = np.delete(patient_result[tag], -1, 0)
            elif tag.startswith("sliced:data:chanzoom:4ch"):
                pass  # done by the 2ch branch below
            elif tag.startswith("sliced:data:chanzoom:2ch"):
                l_4ch = [sax for sax in files if "4ch" in sax]
                l_2ch = [sax for sax in files if "2ch" in sax]
                patient_result[tag] = [
                    disk_access.load_data_from_file(l_4ch[0]) if l_4ch else None,
                    disk_access.load_data_from_file(l_2ch[0]) if l_2ch else None
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(l_4ch[0]) if l_4ch else None,
                    load_clean_metadata(l_2ch[0]) if l_2ch else None,
                    None
                ]
                l = [sax for sax in files if "sax" in sax]
                metadatas_result[tag][2] = [load_clean_metadata(f) for f in l]
            elif tag.startswith("sliced:data:randomslices"):
                l = [sax for sax in files if "sax" in sax]
                nr_slices = result["input"][tag].shape[1]
                chosen_files = utils.pick_random(l, nr_slices)
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in chosen_files
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in chosen_files
                ]
            elif tag.startswith("sliced:data:sax:locations"):
                pass  # will be filled in by sliced:data:sax
            elif tag.startswith("sliced:data:sax:distances"):
                # FIX: two further identical (unreachable) elif branches for
                # this prefix were removed as dead code.
                pass  # will be filled in by sliced:data:sax
            elif tag.startswith("sliced:data:sax:is_not_padded"):
                pass  # will be filled in by sliced:data:sax
            elif tag.startswith("sliced:data:sax"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in files
                    if "sax" in f
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in files if "sax" in f
                ]
            elif tag.startswith("sliced:data:shape"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f).shape for f in files
                ]
                # NOTE(review): data covers all files but metadata only the
                # sax files — looks inconsistent; kept as-is, confirm intent.
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in files if "sax" in f
                ]
            elif tag.startswith("sliced:data"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in files
                ]
                metadatas_result[tag] = [load_clean_metadata(f) for f in files]
            elif tag.startswith("area_per_pixel"):
                patient_result[tag] = None  # filled in during preprocessing
            elif tag.startswith("sliced:meta:all"):
                # strip the tag prefix to get the key used in the pickle
                # (was misspelled "slided:meta:all:"; same length, so the
                # slice — and therefore behaviour — is unchanged)
                key = tag[len("sliced:meta:all:"):]
                patient_result[tag] = [
                    disk_access.load_metadata_from_file(f)[0][key]
                    for f in files
                ]
            elif tag.startswith("sliced:meta"):
                # strip the tag prefix to get the key used in the pickle
                key = tag[len("sliced:meta:"):]
                metadata_field = disk_access.load_metadata_from_file(
                    files[0])[0][key]
                patient_result[tag] = metadata_field
            # add others when needed

        # FIX: forward the caller's testaug flag; it used to be hard-coded to
        # True, which silently ignored the parameter.
        label_correction_function, classification_correction_function = preprocess_function(
            patient_result,
            result=result["input"],
            index=i,
            metadata=metadatas_result,
            testaug=testaug)
        if "classification_correction_function" in wanted_output_tags:
            result["output"]["classification_correction_function"][
                i] = classification_correction_function

        # load the labels
        if "patients" in wanted_output_tags:
            result["output"]["patients"][i] = id
        if "slices" in wanted_output_tags:
            # NOTE(review): slice_id is only bound by the singleslice branch;
            # requesting "slices" without such an input tag raises NameError.
            result["output"]["slices"][i] = slice_id

        # only read labels, when we actually have them
        if id in regular_labels[:, 0]:
            assert regular_labels[id - 1, 0] == id
            V_systole = label_correction_function(regular_labels[id - 1, 1])
            V_diastole = label_correction_function(regular_labels[id - 1, 2])
            if "systole" in wanted_output_tags:
                # cumulative target: 1.0 from the (ceiled) volume onwards
                result["output"]["systole"][i][int(np.ceil(V_systole)):] = 1.0
            if "diastole" in wanted_output_tags:
                result["output"]["diastole"][i][int(np.ceil(V_diastole)):] = 1.0
            if "average" in wanted_output_tags:
                result["output"]["average"][i][
                    int(np.ceil((V_diastole + V_systole) / 2.0)):] = 1.0
            if "systole:onehot" in wanted_output_tags:
                result["output"]["systole:onehot"][i][int(
                    np.ceil(V_systole))] = 1.0
            if "diastole:onehot" in wanted_output_tags:
                result["output"]["diastole:onehot"][i][int(
                    np.ceil(V_diastole))] = 1.0
            if "systole:value" in wanted_output_tags:
                result["output"]["systole:value"][i] = V_systole
            if "diastole:value" in wanted_output_tags:
                result["output"]["diastole:value"][i] = V_diastole
            if "systole:class_weight" in wanted_output_tags:
                result["output"]["systole:class_weight"][
                    i] = utils.linear_weighted(V_systole)
            if "diastole:class_weight" in wanted_output_tags:
                result["output"]["diastole:class_weight"][
                    i] = utils.linear_weighted(V_diastole)
        else:
            if set != "test":
                raise Exception("unknown patient in train or validation set")

    # Check if any of the inputs or outputs are still empty!
    if hasattr(_config(), 'check_inputs') and _config().check_inputs:
        # FIX: dict.items() instead of Py2-only iteritems(); identical
        # iteration behaviour.
        for key, value in itertools.chain(result["input"].items(),
                                          result["output"].items()):
            if key == "classification_correction_function":
                continue
            if not np.any(value):  # there are only zeros in value
                raise Exception("there is an empty value at key %s" % key)
            if not np.isfinite(value).all():  # there are NaN's or infinites
                print(value)
                raise Exception("there is a NaN at key %s" % key)
    return result