def checkNSG(datasetPath: str):
    """Checks that each image has as many nsg masks as nsg_complet + nsg_partiel masks combined"""
    totalDiff = 0
    for imageDir in os.listdir(datasetPath):
        count = [0, 0, 0]
        # Sub-directory names assumed from the 'nsg' and 'complet/partiel' labels printed below
        for index, masksDirName in enumerate(['nsg', 'nsg_complet', 'nsg_partiel']):
            masksDir = os.path.join(datasetPath, imageDir, masksDirName)
            if os.path.isdir(masksDir):
                count[index] = len(os.listdir(masksDir))
        nsg = count[0]
        completAndPartiel = count[1] + count[2]
        if nsg != completAndPartiel:
            diff = abs(nsg - completAndPartiel)
            print("{}: {} {} mask{} missing".format(
                imageDir, diff, 'nsg' if nsg < completAndPartiel else 'complet/partiel',
                's' if diff > 1 else ''))
            totalDiff += diff
    print("Total: {}".format(totalDiff))


if __name__ == "__main__":
    dW.getInfoRawDataset('raw_dataset', verbose=True, adapter=None)
    # Creating masks and making per image directories
    dW.startWrapper('raw_dataset', 'temp_nephrology_dataset', deleteBaseCortexMasks=True, adapter=None)
    infoNephrologyDataset('temp_nephrology_dataset')
    checkNSG('temp_nephrology_dataset')

    # Sorting images to keep those that can be used to train cortex
    sortImages(datasetPath='temp_nephrology_dataset',
               createCortexDataset=True, cortexDatasetPath='nephrology_cortex_dataset',
               unusedDirPath='nephrology_dataset_unused')
    infoNephrologyDataset('nephrology_cortex_dataset')
    # Taking some images from the cortex dataset to make the validation cortex dataset
    createValDataset('nephrology_cortex_dataset', rename=True)
    infoNephrologyDataset('nephrology_cortex_dataset_train')
    infoNephrologyDataset('nephrology_cortex_dataset_val')

    if False:
        # Dividing main dataset in 1024*1024 divisions (disabled; illustrative call mirroring the
        # division step of generateDataset below)
        dD.divideDataset('temp_nephrology_dataset', 'nephrology_dataset',
                         squareSideLength=1024, min_overlap_part=0.33, verbose=1)


def generateInflammationDataset(
        rawDataset: str,
        outputDataset="nephrology_inflammation_dataset",
        cleanBeforeStart=True,
        imageFormat='jpg',
        divisionSize=1024,
        overlap=0.33,
        separate="images",
        recreateValList=None,
        adapter: AnnotationAdapter = None):
    """
    Generates dataset folders from a base directory; all paths are customizable, and previous directories
    can be removed before starting
    :param rawDataset: path to the base directory
    :param outputDataset: path to the output dataset
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :param imageFormat: the image format to look for and to use
    :param divisionSize: size of the output images
    :param overlap: the minimum overlap between two divisions
    :param separate: "images", "patient" or "div"; with "div", divisions of the same image can be split between
                     training and val directories
    :param recreateValList: list of the images to use to recreate the val dataset
    :param adapter: the adapter to use if given, else it will be chosen depending on the annotations found
    :return: None
    """
    base_class = "cortex"
    if cleanBeforeStart:
        # Removing temp directories
        import shutil
        dirToDel = [
            "temp_" + outputDataset, "temp_" + outputDataset + '_val',
            outputDataset, outputDataset + '_train', outputDataset + '_val',
            outputDataset + '_unused'
        ]
        for directory in dirToDel:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)

    # Creating masks and making per image directories
    dW.startWrapper(rawDataset,
                    "temp_" + outputDataset,
                    deleteBaseMasks=True,
                    adapter=adapter,
                    imageFormat=imageFormat,
                    mode="inflammation")
    infoNephrologyDataset("temp_" + outputDataset, baseClass=base_class)

    # Sorting images to keep those that can be used
    sortImages(datasetPath="temp_" + outputDataset,
               unusedDirPath=outputDataset + '_unused',
               mode="main")

    recreateInfo = {
        "mode": "inflammation",
        "output_dataset": outputDataset,
        "clean_before_start": cleanBeforeStart,
        "image_format": imageFormat,
        "division_size": divisionSize,
        "min_overlap_part": overlap,
        "separate": separate,
        "val_dataset": []
    }
    if separate == "div":
        # Dividing dataset
        dD.divideDataset("temp_" + outputDataset,
                         outputDataset,
                         squareSideLength=divisionSize,
                         min_overlap_part=overlap,
                         verbose=1)
        infoNephrologyDataset(outputDataset, baseClass=base_class)

        # Removing unusable images by moving them into a specific directory
        sortImages(outputDataset, unusedDirPath=outputDataset + '_unused')
        # Taking some images from the main dataset to make the validation dataset
        recreateInfo["val_dataset"] = createValDataset(
            outputDataset, rename=True, recreateInfo=recreateValList)
    else:  # Avoid dispatching divisions of the same image between the main and validation datasets
        # Removing unusable images by moving them into a specific directory
        if separate == "patient":
            recreateInfo["val_dataset"] = createValDatasetByPeople(
                rawDataset=rawDataset,
                datasetPath="temp_" + outputDataset,
                valDatasetPath='temp_' + outputDataset + '_val',
                nbPatientBiopsie=7,
                nbPatientNephrectomy=3,
                recreateInfo=recreateValList)
        else:
            # Taking some images from the main dataset to make the validation dataset
            recreateInfo["val_dataset"] = createValDataset(
                "temp_" + outputDataset,
                valDatasetPath='temp_' + outputDataset + '_val',
                rename=False,
                recreateInfo=recreateValList)

        # Dividing the main dataset after having separated images for the validation dataset
        # then removing unusable divisions
        dD.divideDataset("temp_" + outputDataset,
                         outputDataset + '_train',
                         squareSideLength=divisionSize,
                         min_overlap_part=overlap,
                         verbose=1)
        sortImages(outputDataset + '_train',
                   unusedDirPath=outputDataset + '_unused')

        # Same thing with the validation dataset directly
        dD.divideDataset('temp_' + outputDataset + '_val',
                         outputDataset + '_val',
                         squareSideLength=divisionSize,
                         min_overlap_part=overlap,
                         verbose=1)
        sortImages(outputDataset + '_val',
                   unusedDirPath=outputDataset + '_unused')

    infoNephrologyDataset(outputDataset + '_train', baseClass=base_class)
    infoNephrologyDataset(outputDataset + '_val', baseClass=base_class)
    if recreateValList is None or len(recreateValList) == 0:
        with open(f"dataset_{formatDate()}.json", 'w') as recreateFile:
            json.dump(recreateInfo, recreateFile, indent="\t")
    print("\nDataset made, nothing left to do")
def generateCortexDataset(rawDataset: str,
                          outputDataset="nephrology_cortex_dataset",
                          cleanBeforeStart=True,
                          resize=(2048, 2048),
                          overlap=0.,
                          separateDivInsteadOfImage=False,
                          recreateValList=None,
                          adapter: AnnotationAdapter = None):
    """
    Generates dataset folders from a base directory; all paths are customizable, and previous directories
    can be removed before starting
    :param rawDataset: path to the base directory
    :param outputDataset: path to the output cortex dataset
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :param resize: the size of the output images and masks before dividing
    :param overlap: the minimum overlap between two divisions
    :param separateDivInsteadOfImage: if True, divisions of same image can be separated into training and val
                                      directories
    :param recreateValList: list of the images to use to recreate the cortex val dataset
    :param adapter: the adapter to use if given, else it will be chosen depending on the annotations found
    :return: None
    """
    recreateInfo = {
        "mode": "cortex",
        "output_dataset": outputDataset,
        "clean_before_start": cleanBeforeStart,
        "resize": list(resize),
        "separate": "div" if separateDivInsteadOfImage else "images",
        "min_overlap_part": overlap,
        "val_dataset": []
    }
    # Removing former dataset directories
    if cleanBeforeStart:
        import shutil
        dirToDel = [
            "temp_" + outputDataset, outputDataset, outputDataset + '_train',
            outputDataset + '_val'
        ]
        for directory in dirToDel:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)
    # Creating masks for cortex images
    dW.startWrapper(rawDataset,
                    "temp_" + outputDataset,
                    resize=resize,
                    mode="cortex",
                    adapter=adapter)
    if not separateDivInsteadOfImage:
        recreateInfo["val_dataset"] = createValDataset(
            "temp_" + outputDataset,
            valDatasetPath="temp_" + outputDataset + '_val',
            rename=True,
            valDatasetSizePart=0.05,
            valDatasetMinSize=10,
            recreateInfo=recreateValList)
    # If size is not 1024x1024, the dataset must be divided
    if resize is not None and not resize[0] == resize[1] == 1024:
        if separateDivInsteadOfImage:
            divide = {"temp_" + outputDataset: outputDataset}
        else:
            divide = {
                "temp_" + outputDataset + '_val': outputDataset + '_val',
                "temp_" + outputDataset + '_train': outputDataset + '_train'
            }
        for inputPath, outputPath in divide.items():
            dD.divideDataset(inputPath,
                             outputPath,
                             squareSideLength=1024,
                             min_overlap_part=overlap,
                             mode="cortex",
                             verbose=1)
    if separateDivInsteadOfImage:
        # Creating the val dataset by taking some divisions from the main dataset
        recreateInfo["val_dataset"] = createValDataset(
            outputDataset,
            valDatasetPath=outputDataset + '_val',
            rename=True,
            valDatasetSizePart=0.05,
            valDatasetMinSize=10,
            recreateInfo=recreateValList)
    for datasetPath in [outputDataset + '_train', outputDataset + '_val']:
        sortImages(datasetPath, outputDataset + '_unused', mode="cortex")
    infoNephrologyDataset(outputDataset + '_train')
    infoNephrologyDataset(outputDataset + '_val')
    if recreateValList is None or len(recreateValList) == 0:
        with open(f"dataset_cortex_{formatDate()}.json", 'w') as recreateFile:
            json.dump(recreateInfo, recreateFile, indent="\t")
    print("\nDataset made, nothing left to do")
def generateMESTCDataset(rawDataset: str,
                         outputDataset="nephrology_mest_{mode}_dataset",
                         cleanBeforeStart=True,
                         mode="glom",
                         imageFormat='jpg',
                         divisionSize=1024,
                         overlap=0.33,
                         separate="images",
                         recreateValList=None,
                         adapter: AnnotationAdapter = None):
    """
    Generates dataset folders from a base directory; all paths are customizable, and previous directories
    can be removed before starting
    :param rawDataset: path to the base directory
    :param outputDataset: path to the output dataset
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :param mode: the mode to use: "glom" or "main"
    :param imageFormat: the image format to look for and to use
    :param divisionSize: size of the output images
    :param overlap: the minimum overlap between two divisions
    :param separate: "images", "patient" or "div"; with "div", divisions of the same image can be split between
                     training and val directories
    :param recreateValList: list of the images to use to recreate the val dataset
    :param adapter: the adapter to use if given, else it will be chosen depending on the annotations found
    :return: None
    """
    outputDataset = outputDataset.format(mode=mode)
    recreateInfo = {
        "mode": "mest",
        "submode": mode,
        "output_dataset": outputDataset,
        "clean_before_start": cleanBeforeStart,
        "image_format": imageFormat,
        "division_size": divisionSize,
        "min_overlap_part": separate,
        "val_dataset": []
    }
    # Removing former dataset directories
    if cleanBeforeStart:
        import shutil
        dirToDel = [
            "temp_" + outputDataset, "temp_" + outputDataset + '_train',
            "temp_" + outputDataset + '_val', outputDataset,
            outputDataset + '_train', outputDataset + '_val',
            outputDataset + '_unused'
        ]
        for directory in dirToDel:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)
    # Creating masks and making per image directories
    dW.startWrapper(rawDataset,
                    "temp_" + outputDataset,
                    mode=f"mest_{mode}",
                    adapter=adapter)
    if mode == "glom":
        if separate != "div":
            recreateInfo["val_dataset"] = createValDataset(
                "temp_" + outputDataset,
                valDatasetPath="temp_" + outputDataset + '_val',
                rename=True,
                valDatasetSizePart=0.05,
                valDatasetMinSize=10,
                recreateInfo=recreateValList)
            for datasetPart in ["train", "val"]:
                dI.isolateClass(f"temp_{outputDataset}_{datasetPart}",
                                f"{outputDataset}_{datasetPart}",
                                'nsg',
                                image_size=divisionSize,
                                imageFormat=imageFormat,
                                verbose=3,
                                silent=False)
                sortImages(f"{outputDataset}_{datasetPart}",
                           f"{outputDataset}_unused",
                           mode="mest_glom")
        else:
            dI.isolateClass("temp_" + outputDataset,
                            outputDataset,
                            'nsg',
                            image_size=divisionSize,
                            imageFormat=imageFormat,
                            verbose=3,
                            silent=False)
            sortImages(outputDataset,
                       f"{outputDataset}_unused",
                       mode="mest_glom")
            recreateInfo["val_dataset"] = createValDataset(
                outputDataset,
                valDatasetPath=outputDataset + '_val',
                rename=True,
                valDatasetSizePart=0.05,
                valDatasetMinSize=10,
                recreateInfo=recreateValList)
    elif mode == "main":
        if separate in ["images", "patient"]:
            if separate == "images":
                recreateInfo["val_dataset"] = createValDataset(
                    "temp_" + outputDataset,
                    valDatasetPath="temp_" + outputDataset + '_val',
                    rename=False,
                    valDatasetSizePart=0.05,
                    valDatasetMinSize=10,
                    recreateInfo=recreateValList)
            else:
                recreateInfo["val_dataset"] = createValDatasetByPeople(
                    rawDataset=rawDataset,
                    datasetPath="temp_" + outputDataset,
                    valDatasetPath='temp_' + outputDataset + '_val',
                    nbPatientBiopsie=8,
                    nbPatientNephrectomy=2,
                    recreateInfo=recreateValList)
        # Dividing the dataset
        if separate == "div":
            divide = {"temp_" + outputDataset: outputDataset}
        else:
            divide = {
                "temp_" + outputDataset + '_val': outputDataset + '_val',
                "temp_" + outputDataset: outputDataset + '_train'
            }
        for inputPath, outputPath in divide.items():
            dD.divideDataset(inputPath,
                             outputPath,
                             squareSideLength=divisionSize,
                             min_overlap_part=overlap,
                             mode=f"mest_{mode}",
                             verbose=1)

        if separate == "div":
            # Creating the val dataset by taking some divisions from the main dataset
            recreateInfo["val_dataset"] = createValDataset(
                outputDataset,
                valDatasetPath=outputDataset + '_val',
                rename=True,
                valDatasetSizePart=0.05,
                valDatasetMinSize=10,
                recreateInfo=recreateValList)
        for datasetPath in [outputDataset + '_train', outputDataset + '_val']:
            sortImages(datasetPath,
                       outputDataset + '_unused',
                       mode="mest_main")

    infoNephrologyDataset(outputDataset + '_train',
                          baseClass='nsg' if mode == "glom" else "cortex")
    infoNephrologyDataset(outputDataset + '_val',
                          baseClass='nsg' if mode == "glom" else "cortex")
    if recreateValList is None or len(recreateValList) == 0:
        with open(f"dataset_mest_{mode}_{formatDate()}.json",
                  'w') as recreateFile:
            json.dump(recreateInfo, recreateFile, indent="\t")
    print("\nDataset made, nothing left to do")
def generateDataset(rawDataset='raw_dataset',
                    tempDataset='temp_dataset',
                    unusedDirPath='nephrology_dataset_unused',
                    mainDataset='main_dataset',
                    mainDatasetUnusedDirPath='main_dataset_unused',
                    deleteBaseMasks=True,
                    adapter: AnnotationAdapter = None,
                    imageFormat="jpg",
                    recreateValList=None,
                    separateDivInsteadOfImage=False,
                    separateByPatient=True,
                    divisionSize=1024,
                    minDivisionOverlapping=0.33,
                    cleanBeforeStart=False):
    """
    Generates dataset folders from a base directory; all paths are customizable, and previous directories
    can be removed before starting
    :param rawDataset: path to the base directory
    :param tempDataset: path to a temporary directory
    :param unusedDirPath: path to the unused files' directory
    :param mainDataset: path to the main dataset directory, also used to define the training and validation directories
    :param mainDatasetUnusedDirPath: path to unused files' directory of main dataset
    :param deleteBaseMasks: whether to delete base masks or not
    :param adapter: the adapter used to read annotations files, if None, will detect automatically which one to use
    :param imageFormat: the image format to use for the datasets
    :param recreateValList: list of images to use to recreate val dataset
    :param separateDivInsteadOfImage: if True, divisions of same image can be separated into training and val
                                      directories
    :param separateByPatient: if True and not separateDivInsteadOfImage, will create validation directory based on
                              patient
    :param divisionSize: the size of a division, default is 1024
    :param minDivisionOverlapping: the minimum overlap between two divisions, default is 33%
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :return: None
    """
    if cleanBeforeStart:
        # Removing temp directories
        import shutil
        dirToDel = [
            tempDataset, unusedDirPath, 'temp_' + mainDataset + '_val',
            mainDataset + '_val', mainDataset + '_train'
        ]
        for directory in dirToDel:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)

    # Creating masks and making per image directories
    dW.startWrapper(rawDataset,
                    tempDataset,
                    deleteBaseMasks=deleteBaseMasks,
                    adapter=adapter,
                    imageFormat=imageFormat,
                    mode="main")
    infoNephrologyDataset(tempDataset, baseClass='cortex')
    checkNSG(tempDataset)

    # Sorting images to keep those that can be used
    sortImages(datasetPath=tempDataset,
               unusedDirPath=unusedDirPath,
               mode="main")

    recreateInfo = {
        "mode": "main",
        "temp_dataset": tempDataset,
        "unused_dir_path": unusedDirPath,
        "main_dataset": mainDataset,
        "main_dataset_unused_path": mainDatasetUnusedDirPath,
        "delete_base_masks": deleteBaseMasks,
        "image_format": imageFormat,
        "separate": "div" if separateDivInsteadOfImage else ("patient" if separateByPatient else "images"),
        "division_size": divisionSize,
        "min_overlap_part": minDivisionOverlapping,
        "clean_before_start": cleanBeforeStart,
        "val_dataset": []
    }
    if separateDivInsteadOfImage:
        # Dividing main dataset in 1024*1024 divisions
        dD.divideDataset(tempDataset,
                         mainDataset,
                         squareSideLength=divisionSize,
                         min_overlap_part=minDivisionOverlapping,
                         verbose=1)
        infoNephrologyDataset(mainDataset, baseClass='cortex')

        # To keep all cortex files, leave the dW.cleanFusedClassDir() lines below commented out
        # To check the cortex files before deleting them, run these lines only after checking
        # dW.cleanFusedClassDir(tempDataset, 'cortex')
        # dW.cleanFusedClassDir(mainDataset, 'cortex')

        # Removing unusable images by moving them into a specific directory
        sortImages(mainDataset, unusedDirPath=mainDatasetUnusedDirPath)
        # Taking some images from the main dataset to make the validation dataset
        recreateInfo["val_dataset"] = createValDataset(
            mainDataset, rename=True, recreateInfo=recreateValList)
    else:  # Avoid dispatching divisions of the same image between the main and validation datasets
        # Removing unusable images by moving them into a specific directory
        if separateByPatient:
            recreateInfo["val_dataset"] = createValDatasetByPeople(
                rawDataset=rawDataset,
                datasetPath=tempDataset,
                valDatasetPath='temp_' + mainDataset + '_val',
                nbPatientBiopsie=7,
                nbPatientNephrectomy=3,
                recreateInfo=recreateValList)
        else:
            # Taking some images from the main dataset to make the validation dataset
            recreateInfo["val_dataset"] = createValDataset(
                tempDataset,
                valDatasetPath='temp_' + mainDataset + '_val',
                rename=False,
                recreateInfo=recreateValList)

        # Dividing the main dataset after having separated images for the validation dataset
        # then removing unusable divisions
        dD.divideDataset(tempDataset,
                         mainDataset + '_train',
                         squareSideLength=divisionSize,
                         min_overlap_part=minDivisionOverlapping,
                         verbose=1)
        sortImages(mainDataset + '_train',
                   unusedDirPath=mainDatasetUnusedDirPath)

        # Same thing with the validation dataset directly
        dD.divideDataset('temp_' + mainDataset + '_val',
                         mainDataset + '_val',
                         squareSideLength=divisionSize,
                         min_overlap_part=minDivisionOverlapping,
                         verbose=1)
        sortImages(mainDataset + '_val',
                   unusedDirPath=mainDatasetUnusedDirPath)

    infoNephrologyDataset(mainDataset + '_train', baseClass='cortex')
    infoNephrologyDataset(mainDataset + '_val', baseClass='cortex')
    if recreateValList is None or len(recreateValList) == 0:
        with open(f"dataset_{formatDate()}.json", 'w') as recreateFile:
            json.dump(recreateInfo, recreateFile, indent="\t")
    print("\nDataset made, nothing left to do")