Example #1
def load_clean_metadata(f):
    # clean the first frame's metadata and enrich it with the patient id and
    # slice name derived from the file path
    m = utils.clean_metadata(disk_access.load_metadata_from_file(f)[0])
    pid = _extract_id_from_path(f)
    slicename = os.path.basename(f)
    _enhance_metadata(m, pid, slicename)
    return m
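A minimal usage sketch of the example above, assuming the same project context (the disk_access/utils modules and an _in_folder helper that lists a patient's slice files); the folder path is purely hypothetical:

import os

# Hypothetical usage: clean the metadata of every slice file in one patient
# folder. The path and the _in_folder helper are assumptions taken from the
# surrounding project, not part of the snippet itself.
patient_folder = "/data/train/patient_0001"
metadata_by_slice = {
    os.path.basename(f): load_clean_metadata(f)
    for f in _in_folder(patient_folder)
}
print(len(metadata_by_slice), "slices with cleaned metadata")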
Example #2
def get_patient_data(indices,
                     wanted_input_tags,
                     wanted_output_tags,
                     set="train",
                     preprocess_function=None,
                     testaug=False):
    """
    return a dict with the desired data matched to the required tags
    :param wanted_data_tags:
    :return:
    """
    def initialise_empty():
        """Initialise empty chunk
        """
        result = {
            "input": {},
            "output": {},
        }

        no_samples = _config().batch_size * _config().batches_per_chunk
        vector_size = (no_samples, )
        matrix_size = (no_samples, 600)

        OUTPUT_DATA_SIZE_TYPE = {
            "systole": (matrix_size, "float32"),
            "diastole": (matrix_size, "float32"),
            "average": (matrix_size, "float32"),
            "systole:onehot": (matrix_size, "float32"),
            "diastole:onehot": (matrix_size, "float32"),
            "systole:class_weight": (matrix_size, "float32"),
            "diastole:class_weight": (matrix_size, "float32"),
            "systole:value": (vector_size, "float32"),
            "diastole:value": (vector_size, "float32"),
            "patients": (vector_size, "int32"),
            "slices": (vector_size, "int32"),
            "area_per_pixel": (no_samples, ),
        }

        for tag in wanted_output_tags:
            if tag in OUTPUT_DATA_SIZE_TYPE:
                size, dtype = OUTPUT_DATA_SIZE_TYPE[tag]
                result["output"][tag] = np.zeros(size, dtype=dtype)

        for tag in wanted_input_tags:
            if tag in _config().data_sizes:
                chunk_shape = list(_config().data_sizes[tag])
                chunk_shape[0] = chunk_shape[0] * _config().batches_per_chunk
                chunk_shape = tuple(chunk_shape)
                result["input"][tag] = np.zeros(chunk_shape, dtype="float32")

        if "classification_correction_function" in wanted_output_tags:
            result["output"]["classification_correction_function"] = [
                lambda x: x
            ] * no_samples

        return result

    result = initialise_empty()

    if set not in patient_folders:
        raise ValueError("Don't know the dataset %s" % set)
    folders = [
        patient_folders[set][i] for i in indices if 0 <= i < num_patients[set]
    ]

    # Iterate over folders
    for i, folder in enumerate(folders):
        # find the id of the current patient in the folder name (which is safer)
        id = _extract_id_from_path(folder)

        files = _in_folder(folder)
        patient_result = dict()
        metadatas_result = dict()

        # function for loading and cleaning metadata. Only use the first frame
        def load_clean_metadata(f):
            m = utils.clean_metadata(disk_access.load_metadata_from_file(f)[0])
            pid = _extract_id_from_path(f)
            slicename = os.path.basename(f)
            _enhance_metadata(m, pid, slicename)
            return m

        # Iterate over input tags
        for tag in wanted_input_tags:
            if tag.startswith("sliced:data:singleslice"):
                if "4ch" in tag:
                    l = [sax for sax in files if "4ch" in sax]
                elif "2ch" in tag:
                    l = [sax for sax in files if "2ch" in sax]
                else:
                    l = [sax for sax in files if "sax" in sax]
                if not l:
                    if hasattr(_config(),
                               'check_inputs') and _config().check_inputs:
                        print(
                            "Warning: patient %d has no images of this type" %
                            id)
                    continue
                if "middle" in tag:
                    # Sort sax files, based on the integer in their name
                    l.sort(key=lambda f: int(
                        re.findall("\d+", os.path.basename(f))[0]))
                    f = l[len(l) / 2]
                else:
                    f = random.choice(l)
                patient_result[tag] = disk_access.load_data_from_file(f)
                metadatas_result[tag] = load_clean_metadata(f)
                slice_id = _extract_slice_id_from_path(f)
                if "difference" in tag:
                    for j in range(patient_result[tag].shape[0] - 1):
                        patient_result[tag][j] -= patient_result[tag][j + 1]
                    patient_result[tag] = np.delete(patient_result[tag], -1, 0)
            elif tag.startswith("sliced:data:chanzoom:4ch"):
                pass  # done by the next one
            elif tag.startswith("sliced:data:chanzoom:2ch"):
                l_4ch = [sax for sax in files if "4ch" in sax]
                l_2ch = [sax for sax in files if "2ch" in sax]
                patient_result[tag] = [
                    disk_access.load_data_from_file(l_4ch[0])
                    if l_4ch else None,
                    disk_access.load_data_from_file(l_2ch[0])
                    if l_2ch else None
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(l_4ch[0]) if l_4ch else None,
                    load_clean_metadata(l_2ch[0]) if l_2ch else None, None
                ]

                l = [sax for sax in files if "sax" in sax]
                metadatas_result[tag][2] = [load_clean_metadata(f) for f in l]

            elif tag.startswith("sliced:data:randomslices"):
                l = [sax for sax in files if "sax" in sax]
                nr_slices = result["input"][tag].shape[1]
                chosen_files = utils.pick_random(l, nr_slices)
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in chosen_files
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in chosen_files
                ]

            elif tag.startswith("sliced:data:sax:locations"):
                pass  # will be filled in by sliced:data:sax

            elif tag.startswith("sliced:data:sax:distances"):
                pass  # will be filled in by sliced:data:sax

            elif tag.startswith("sliced:data:sax:is_not_padded"):
                pass  # will be filled in by sliced:data:sax

            elif tag.startswith("sliced:data:sax:distances"):
                pass  # will be filled in by the next one

            elif tag.startswith("sliced:data:sax:distances"):
                pass  # will be filled in by the next one

            elif tag.startswith("sliced:data:sax"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in files
                    if "sax" in f
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in files if "sax" in f
                ]

            elif tag.startswith("sliced:data:shape"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f).shape for f in files
                ]
                metadatas_result[tag] = [
                    load_clean_metadata(f) for f in files if "sax" in f
                ]

            elif tag.startswith("sliced:data"):
                patient_result[tag] = [
                    disk_access.load_data_from_file(f) for f in files
                ]
                metadatas_result[tag] = [load_clean_metadata(f) for f in files]

            elif tag.startswith("area_per_pixel"):
                patient_result[
                    tag] = None  # they are filled in in preprocessing

            elif tag.startswith("sliced:meta:all"):
                # get the key used in the pickle
                key = tag[len("slided:meta:all:"):]
                patient_result[tag] = [
                    disk_access.load_metadata_from_file(f)[0][key]
                    for f in files
                ]
            elif tag.startswith("sliced:meta"):
                # get the key used in the pickle
                key = tag[len("slided:meta:"):]
                metadata_field = disk_access.load_metadata_from_file(
                    files[0])[0][key]
                patient_result[tag] = metadata_field
            # add others when needed

        label_correction_function, classification_correction_function = preprocess_function(
            patient_result,
            result=result["input"],
            index=i,
            metadata=metadatas_result,
            testaug=testaug)

        if "classification_correction_function" in wanted_output_tags:
            result["output"]["classification_correction_function"][
                i] = classification_correction_function

        # load the labels
        if "patients" in wanted_output_tags:
            result["output"]["patients"][i] = id

        if "slices" in wanted_output_tags:
            result["output"]["slices"][i] = slice_id

        # only read labels when we actually have them
        if id in regular_labels[:, 0]:
            assert regular_labels[id - 1, 0] == id
            V_systole = label_correction_function(regular_labels[id - 1, 1])
            V_diastole = label_correction_function(regular_labels[id - 1, 2])

            if "systole" in wanted_output_tags:
                result["output"]["systole"][i][int(np.ceil(V_systole)):] = 1.0
            if "diastole" in wanted_output_tags:
                result["output"]["diastole"][i][int(np.ceil(V_diastole)
                                                    ):] = 1.0
            if "average" in wanted_output_tags:
                result["output"]["average"][i][
                    int(np.ceil((V_diastole + V_systole) / 2.0)):] = 1.0

            if "systole:onehot" in wanted_output_tags:
                result["output"]["systole:onehot"][i][int(
                    np.ceil(V_systole))] = 1.0
            if "diastole:onehot" in wanted_output_tags:
                result["output"]["diastole:onehot"][i][int(
                    np.ceil(V_diastole))] = 1.0

            if "systole:value" in wanted_output_tags:
                result["output"]["systole:value"][i] = V_systole
            if "diastole:value" in wanted_output_tags:
                result["output"]["diastole:value"][i] = V_diastole

            if "systole:class_weight" in wanted_output_tags:
                result["output"]["systole:class_weight"][
                    i] = utils.linear_weighted(V_systole)
            if "diastole:class_weight" in wanted_output_tags:
                result["output"]["diastole:class_weight"][
                    i] = utils.linear_weighted(V_diastole)

        else:
            if set != "test":
                raise Exception("unknown patient in train or validation set")

    # Check if any of the inputs or outputs are still empty!
    if hasattr(_config(), 'check_inputs') and _config().check_inputs:
        for key, value in itertools.chain(iter(result["input"].items()),
                                          iter(result["output"].items())):
            if key == "classification_correction_function":
                continue
            if not np.any(value):  # there are only zeros in value
                raise Exception("there is an empty value at key %s" % key)
            if not np.isfinite(value).all():
                # there are NaNs or infinities somewhere
                print(value)
                raise Exception("there is a NaN at key %s" % key)

    return result
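A minimal call sketch for get_patient_data, assuming the module-level globals (patient_folders, num_patients, regular_labels, _config) have been initialised by the surrounding project; the tag names come from the dictionaries above, and the no-op preprocess function below is a hypothetical stand-in for the project's real augmentation code:

def _noop_preprocess(patient_data, result, index, metadata, testaug):
    # hypothetical stand-in: a real preprocess_function writes the (augmented)
    # images into `result` and returns the two correction functions
    return (lambda volume: volume), (lambda prediction: prediction)

chunk = get_patient_data(
    indices=range(_config().batch_size * _config().batches_per_chunk),
    wanted_input_tags=["sliced:data:singleslice:middle"],
    wanted_output_tags=["systole", "diastole", "patients"],
    set="train",
    preprocess_function=_noop_preprocess)
print(chunk["output"]["patients"])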
Example #3
def get_patient_data(indices, wanted_input_tags, wanted_output_tags,
                     set="train", preprocess_function=None, testaug=False):
    """
    return a dict with the desired data matched to the required tags
    :param wanted_data_tags:
    :return:
    """

    def initialise_empty():
        """Initialise empty chunk
        """
        result = {
            "input": {},
            "output": {},
        }

        no_samples = _config().batch_size * _config().batches_per_chunk
        vector_size = (no_samples, )
        matrix_size = (no_samples, 600)

        OUTPUT_DATA_SIZE_TYPE = {
            "systole": (matrix_size, "float32"),
            "diastole": (matrix_size, "float32"),
            "average": (matrix_size, "float32"),
            "systole:onehot": (matrix_size, "float32"),
            "diastole:onehot": (matrix_size, "float32"),
            "systole:class_weight": (matrix_size, "float32"),
            "diastole:class_weight": (matrix_size, "float32"),
            "systole:value": (vector_size, "float32"),
            "diastole:value": (vector_size, "float32"),
            "patients": (vector_size, "int32"),
            "slices": (vector_size, "int32"),
            "area_per_pixel": (no_samples, ),
        }

        for tag in wanted_output_tags:
            if tag in OUTPUT_DATA_SIZE_TYPE:
                size, dtype = OUTPUT_DATA_SIZE_TYPE[tag]
                result["output"][tag] = np.zeros(size, dtype=dtype)

        for tag in wanted_input_tags:
            if tag in _config().data_sizes:
                chunk_shape = list(_config().data_sizes[tag])
                chunk_shape[0] = chunk_shape[0] * _config().batches_per_chunk
                chunk_shape = tuple(chunk_shape)
                result["input"][tag] = np.zeros(chunk_shape, dtype="float32")

        if "classification_correction_function" in wanted_output_tags:
            result["output"]["classification_correction_function"] = [lambda x:x] * no_samples

        return result

    result = initialise_empty()

    if set not in patient_folders:
        raise ValueError("Don't know the dataset %s" % set)
    folders = [
        patient_folders[set][i] for i in indices if 0<=i<num_patients[set]]

    # Iterate over folders
    for i, folder in enumerate(folders):
        # find the id of the current patient in the folder name (which is safer)
        id = _extract_id_from_path(folder)

        files = _in_folder(folder)
        patient_result = dict()
        metadatas_result = dict()
        # function for loading and cleaning metadata. Only use the first frame
        def load_clean_metadata(f):
            m = utils.clean_metadata(disk_access.load_metadata_from_file(f)[0])
            pid = _extract_id_from_path(f)
            slicename = os.path.basename(f)
            _enhance_metadata(m, pid, slicename)
            return m

        # Iterate over input tags
        for tag in wanted_input_tags:
            if tag.startswith("sliced:data:singleslice"):
                if "4ch" in tag:
                    l = [sax for sax in files if "4ch" in sax]
                elif  "2ch" in tag:
                    l = [sax for sax in files if "2ch" in sax]
                else:
                    l = [sax for sax in files if "sax" in sax]
                if not l:
                    if hasattr(_config(), 'check_inputs') and _config().check_inputs:
                        print "Warning: patient %d has no images of this type" % id
                    continue
                if "middle" in tag:
                    # Sort sax files, based on the integer in their name
                    l.sort(key=lambda f: int(re.findall("\d+", os.path.basename(f))[0]))
                    f = l[len(l)/2]
                else:
                    f = random.choice(l)
                patient_result[tag] = disk_access.load_data_from_file(f)
                metadatas_result[tag] = load_clean_metadata(f)
                slice_id = _extract_slice_id_from_path(f)
                if "difference" in tag:
                    for j in xrange(patient_result[tag].shape[0]-1):
                        patient_result[tag][j] -= patient_result[tag][j+1]
                    patient_result[tag] = np.delete(patient_result[tag],-1,0)
            elif tag.startswith("sliced:data:chanzoom:4ch"):
                pass # done by the next one
            elif tag.startswith("sliced:data:chanzoom:2ch"):
                l_4ch = [sax for sax in files if "4ch" in sax]
                l_2ch = [sax for sax in files if "2ch" in sax]
                patient_result[tag] = [disk_access.load_data_from_file(l_4ch[0]) if l_4ch else None,
                                       disk_access.load_data_from_file(l_2ch[0]) if l_2ch else None]
                metadatas_result[tag] = [load_clean_metadata(l_4ch[0]) if l_4ch else None,
                                         load_clean_metadata(l_2ch[0]) if l_2ch else None,
                                         None]


                l = [sax for sax in files if "sax" in sax]
                metadatas_result[tag][2] = [load_clean_metadata(f) for f in l]

            elif tag.startswith("sliced:data:randomslices"):
                l = [sax for sax in files if "sax" in sax]
                nr_slices = result["input"][tag].shape[1]
                chosen_files = utils.pick_random(l, nr_slices)
                patient_result[tag] = [disk_access.load_data_from_file(f) for f in chosen_files]
                metadatas_result[tag] = [load_clean_metadata(f) for f in chosen_files]

            elif tag.startswith("sliced:data:sax:locations"):
                pass  # will be filled in by sliced:data:sax

            elif tag.startswith("sliced:data:sax:distances"):
                pass  # will be filled in by sliced:data:sax

            elif tag.startswith("sliced:data:sax:is_not_padded"):
                pass  # will be filled in by sliced:data:sax

            elif tag.startswith("sliced:data:sax:distances"):
                pass  # will be filled in by the next one

            elif tag.startswith("sliced:data:sax:distances"):
                pass  # will be filled in by the next one

            elif tag.startswith("sliced:data:sax"):
                patient_result[tag] = [disk_access.load_data_from_file(f) for f in files if "sax" in f]
                metadatas_result[tag] = [load_clean_metadata(f) for f in files if "sax" in f]

            elif tag.startswith("sliced:data:shape"):
                patient_result[tag] = [disk_access.load_data_from_file(f).shape for f in files]
                metadatas_result[tag] = [load_clean_metadata(f) for f in files if "sax" in f]

            elif tag.startswith("sliced:data"):
                patient_result[tag] = [disk_access.load_data_from_file(f) for f in files]
                metadatas_result[tag] = [load_clean_metadata(f) for f in files]

            elif tag.startswith("area_per_pixel"):
                patient_result[tag] = None  # they are filled in in preprocessing

            elif tag.startswith("sliced:meta:all"):
                # get the key used in the pickle
                key = tag[len("slided:meta:all:"):]
                patient_result[tag] = [disk_access.load_metadata_from_file(f)[0][key] for f in files]
            elif tag.startswith("sliced:meta"):
                # get the key used in the pickle
                key = tag[len("slided:meta:"):]
                metadata_field = disk_access.load_metadata_from_file(files[0])[0][key]
                patient_result[tag] = metadata_field
            # add others when needed

        label_correction_function, classification_correction_function = preprocess_function(patient_result, result=result["input"], index=i, metadata=metadatas_result, testaug=testaug)

        if "classification_correction_function" in wanted_output_tags:
            result["output"]["classification_correction_function"][i] = classification_correction_function

        # load the labels
        if "patients" in wanted_output_tags:
            result["output"]["patients"][i] = id

        if "slices" in wanted_output_tags:
            result["output"]["slices"][i] = slice_id

        # only read labels when we actually have them
        if id in regular_labels[:, 0]:
            assert regular_labels[id-1, 0]==id
            V_systole = label_correction_function(regular_labels[id-1, 1])
            V_diastole = label_correction_function(regular_labels[id-1, 2])

            if "systole" in wanted_output_tags:
                result["output"]["systole"][i][int(np.ceil(V_systole)):] = 1.0
            if "diastole" in wanted_output_tags:
                result["output"]["diastole"][i][int(np.ceil(V_diastole)):] = 1.0
            if "average" in wanted_output_tags:
                result["output"]["average"][i][int(np.ceil((V_diastole + V_systole)/2.0)):] = 1.0

            if "systole:onehot" in wanted_output_tags:
                result["output"]["systole:onehot"][i][int(np.ceil(V_systole))] = 1.0
            if "diastole:onehot" in wanted_output_tags:
                result["output"]["diastole:onehot"][i][int(np.ceil(V_diastole))] = 1.0

            if "systole:value" in wanted_output_tags:
                result["output"]["systole:value"][i] = V_systole
            if "diastole:value" in wanted_output_tags:
                result["output"]["diastole:value"][i] = V_diastole

            if "systole:class_weight" in wanted_output_tags:
                result["output"]["systole:class_weight"][i] = utils.linear_weighted(V_systole)
            if "diastole:class_weight" in wanted_output_tags:
                result["output"]["diastole:class_weight"][i] = utils.linear_weighted(V_diastole)

        else:
            if set!="test":
                raise Exception("unknown patient in train or validation set")


    # Check if any of the inputs or outputs are still empty!
    if hasattr(_config(), 'check_inputs') and _config().check_inputs:
        for key, value in itertools.chain(result["input"].iteritems(), result["output"].iteritems()):
            if key=="classification_correction_function":
                continue
            if not np.any(value):  # there are only zeros in value
                raise Exception("there is an empty value at key %s" % key)
            if not np.isfinite(value).all():  # there are NaNs or infinities somewhere
                print value
                raise Exception("there is a NaN at key %s" % key)

    return result