count[index] = len(os.listdir(dir)) nsg = count[0] completAndPartiel = count[1] + count[2] if nsg != completAndPartiel: diff = abs(count[0] - count[1] - count[2]) print( "{} : {} {} manquant{}".format(imageDir, diff, 'nsg' if nsg < completAndPartiel else 'complet/partiel', 's' if diff > 1 else '')) totalDiff += diff print("Total : {}".format(totalDiff)) if __name__ == "__main__": dW.getInfoRawDataset('raw_dataset', verbose=True, adapter=None) # Creating masks and making per image directories dW.startWrapper('raw_dataset', 'temp_nephrology_dataset', deleteBaseCortexMasks=True, adapter=None) infoNephrologyDataset('temp_nephrology_dataset') checkNSG('temp_nephrology_dataset') # Sorting images to keep those that can be used to train cortex sortImages(datasetPath='temp_nephrology_dataset', createCortexDataset=True, cortexDatasetPath='nephrology_cortex_dataset', unusedDirPath='nephrology_dataset_unused') infoNephrologyDataset('nephrology_cortex_dataset') # Taking some images from the cortex dataset to make the validation cortex dataset createValDataset('nephrology_cortex_dataset', rename=True) infoNephrologyDataset('nephrology_cortex_dataset_train') infoNephrologyDataset('nephrology_cortex_dataset_val') if False: # Dividing main dataset in 1024*1024 divisions
def generateInflammationDataset(rawDataset: str, outputDataset="nephrology_inflammation_dataset",
                                cleanBeforeStart=True, imageFormat='jpg', divisionSize=1024, overlap=0.33,
                                separate="images", recreateValList=None, adapter: AnnotationAdapter = None):
    """
    Builds the inflammation dataset folders from a base directory; every path is customizable and previous
    output directories can be wiped first.
    :param rawDataset: path to the base directory
    :param outputDataset: path to the output dataset
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :param imageFormat: the image format to look for and to use
    :param divisionSize: size of the output images
    :param overlap: the least overlap between two divisions
    :param separate: train/val split mode: "div" separates divisions of a same image, "patient" splits by
                     patient, any other value (default "images") splits whole images
    :param recreateValList: list of the images to use to recreate val dataset
    :param adapter: the adapter to use if given, else it will be chosen depending on the annotations found
    :return: None
    """
    baseline = "cortex"
    tempDataset = "temp_" + outputDataset
    if cleanBeforeStart:
        # Wipe anything a previous run may have left behind
        import shutil
        for stale in (tempDataset, tempDataset + '_val', outputDataset, outputDataset + '_train',
                      outputDataset + '_val', outputDataset + '_unused'):
            if os.path.exists(stale):
                shutil.rmtree(stale, ignore_errors=True)
    # Build masks and one directory per image
    dW.startWrapper(rawDataset, tempDataset, deleteBaseMasks=True, adapter=adapter, imageFormat=imageFormat,
                    mode="inflammation")
    infoNephrologyDataset(tempDataset, baseClass=baseline)
    # Keep only the images that can actually be used
    sortImages(datasetPath=tempDataset, unusedDirPath=outputDataset + '_unused', mode="main")
    # Recorded so the exact same dataset can be regenerated later (key order is preserved in the JSON file)
    recreateInfo = dict(mode="inflammation", output_dataset=outputDataset, clean_before_start=cleanBeforeStart,
                        image_format=imageFormat, division_size=divisionSize, min_overlap_part=overlap,
                        separate=separate, val_dataset=[])
    if separate == "div":
        # Divide first, then pick the validation part from the divisions themselves
        dD.divideDataset(tempDataset, outputDataset, squareSideLength=divisionSize, min_overlap_part=overlap,
                         verbose=1)
        infoNephrologyDataset(outputDataset, baseClass=baseline)
        sortImages(outputDataset, unusedDirPath=outputDataset + '_unused')
        recreateInfo["val_dataset"] = createValDataset(outputDataset, rename=True, recreateInfo=recreateValList)
    else:
        # Extract validation images BEFORE dividing so divisions of one image never land on both sides
        if separate == "patient":
            recreateInfo["val_dataset"] = createValDatasetByPeople(rawDataset=rawDataset, datasetPath=tempDataset,
                                                                   valDatasetPath=tempDataset + '_val',
                                                                   nbPatientBiopsie=7, nbPatientNephrectomy=3,
                                                                   recreateInfo=recreateValList)
        else:
            recreateInfo["val_dataset"] = createValDataset(tempDataset, valDatasetPath=tempDataset + '_val',
                                                           rename=False, recreateInfo=recreateValList)
        # Divide both parts, dropping unusable divisions as we go
        dD.divideDataset(tempDataset, outputDataset + '_train', squareSideLength=divisionSize,
                         min_overlap_part=overlap, verbose=1)
        sortImages(outputDataset + '_train', unusedDirPath=outputDataset + '_unused')
        dD.divideDataset(tempDataset + '_val', outputDataset + '_val', squareSideLength=divisionSize,
                         min_overlap_part=overlap, verbose=1)
        sortImages(outputDataset + '_val', unusedDirPath=outputDataset + '_unused')
    infoNephrologyDataset(outputDataset + '_train', baseClass=baseline)
    infoNephrologyDataset(outputDataset + '_val', baseClass=baseline)
    if recreateValList is None or len(recreateValList) == 0:
        # Only save the recreate file when this was a fresh split, not a recreation
        with open(f"dataset_{formatDate()}.json", 'w') as out_file:
            json.dump(recreateInfo, out_file, indent="\t")
    print("\nDataset made, nothing left to do")
def generateCortexDataset(rawDataset: str, outputDataset="nephrology_cortex_dataset", cleanBeforeStart=True,
                          resize=(2048, 2048), overlap=0., separateDivInsteadOfImage=False, recreateValList=None,
                          adapter: AnnotationAdapter = None):
    """
    Generates the cortex dataset folders from a base directory, all paths are customizable, and it can also
    remove previous directories
    :param rawDataset: path to the base directory
    :param outputDataset: path to the output cortex dataset
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :param resize: the size of the output images and masks before dividing
    :param overlap: the least overlap between two divisions
    :param separateDivInsteadOfImage: if True, divisions of same image can be separated into training and val
                                      directories
    :param recreateValList: list of the images to use to recreate cortex dataset
    :param adapter: the adapter to use if given, else it will be chosen depending on the annotations found
    :return: None
    """
    # Recorded so the exact same dataset can be regenerated later
    recreateInfo = {
        "mode": "cortex", "output_dataset": outputDataset, "clean_before_start": cleanBeforeStart,
        "resize": list(resize), "separate": "div" if separateDivInsteadOfImage else "images",
        "min_overlap_part": overlap, "val_dataset": []
    }
    # Removing former dataset directories
    if cleanBeforeStart:
        import shutil
        # Also remove the temp val and unused directories so a previous run cannot pollute this one
        # (kept consistent with the other generate*Dataset functions of this module)
        dirToDel = ["temp_" + outputDataset, "temp_" + outputDataset + '_val', outputDataset,
                    outputDataset + '_train', outputDataset + '_val', outputDataset + '_unused']
        for directory in dirToDel:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)
    # Creating masks for cortices images
    dW.startWrapper(rawDataset, "temp_" + outputDataset, resize=resize, mode="cortex", adapter=adapter)
    if not separateDivInsteadOfImage:
        # rename=True renames the remaining source directory with a '_train' suffix once val images are extracted
        recreateInfo["val_dataset"] = createValDataset("temp_" + outputDataset,
                                                       valDatasetPath="temp_" + outputDataset + '_val', rename=True,
                                                       valDatasetSizePart=0.05, valDatasetMinSize=10,
                                                       recreateInfo=recreateValList)
    # If size is greater than 1024x1024, dataset must be divided
    # NOTE(review): if resize is None or already 1024x1024 the '_train'/'_val' output directories are never
    # created, so the sortImages/info calls below would not find them — confirm callers always resize above 1024
    if resize is not None and not resize[0] == resize[1] == 1024:
        if separateDivInsteadOfImage:
            divide = {"temp_" + outputDataset: outputDataset}
        else:
            divide = {"temp_" + outputDataset + '_val': outputDataset + '_val',
                      "temp_" + outputDataset + '_train': outputDataset + '_train'}
        for inputPath, outputPath in divide.items():
            dD.divideDataset(inputPath, outputPath, squareSideLength=1024, min_overlap_part=overlap, mode="cortex",
                             verbose=1)
    if separateDivInsteadOfImage:
        # Creating the validation dataset from the divisions of the whole divided dataset
        recreateInfo["val_dataset"] = createValDataset(outputDataset, valDatasetPath=outputDataset + '_val',
                                                       rename=True, valDatasetSizePart=0.05, valDatasetMinSize=10,
                                                       recreateInfo=recreateValList)
    # Removing unusable images by moving them into a specific directory
    for datasetPath in [outputDataset + '_train', outputDataset + '_val']:
        sortImages(datasetPath, outputDataset + '_unused', mode="cortex")
    infoNephrologyDataset(outputDataset + '_train')
    infoNephrologyDataset(outputDataset + '_val')
    if recreateValList is None or len(recreateValList) == 0:
        # Only save the recreate file when this was a fresh split, not a recreation
        with open(f"dataset_cortex_{formatDate()}.json", 'w') as recreateFile:
            json.dump(recreateInfo, recreateFile, indent="\t")
    print("\nDataset made, nothing left to do")
def generateMESTCDataset(rawDataset: str, outputDataset="nephrology_mest_{mode}_dataset", cleanBeforeStart=True,
                         mode="glom", imageFormat='jpg', divisionSize=1024, overlap=0.33, separate="images",
                         recreateValList=None, adapter: AnnotationAdapter = None):
    """
    Generates the MEST-C dataset folders from a base directory, all paths are customizable, and it can also
    remove previous directories
    :param rawDataset: path to the base directory
    :param outputDataset: path to the output dataset, '{mode}' is replaced by the mode value
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :param mode: the mode to use : glom or main
    :param imageFormat: the image format to look for and to use
    :param divisionSize: size of the output images
    :param overlap: the least overlap between two divisions
    :param separate: train/val split mode: "div" separates divisions of a same image, "patient" splits by
                     patient, any other value (default "images") splits whole images
    :param recreateValList: list of the images to use to recreate val dataset
    :param adapter: the adapter to use if given, else it will be chosen depending on the annotations found
    :return: None
    """
    outputDataset = outputDataset.format(mode=mode)
    # Recorded so the exact same dataset can be regenerated later.
    # BUGFIX: "min_overlap_part" previously stored the `separate` string and the "separate" key was missing,
    # so the saved recreate-JSON lost the overlap value (siblings generateDataset/generateInflammationDataset
    # store both keys correctly).
    recreateInfo = {"mode": "mest", "submode": mode, "output_dataset": outputDataset,
                    "clean_before_start": cleanBeforeStart, "image_format": imageFormat,
                    "division_size": divisionSize, "min_overlap_part": overlap, "separate": separate,
                    "val_dataset": []}
    # Removing former dataset directories
    if cleanBeforeStart:
        import shutil
        dirToDel = ["temp_" + outputDataset, "temp_" + outputDataset + '_train', "temp_" + outputDataset + '_val',
                    outputDataset, outputDataset + '_train', outputDataset + '_val', outputDataset + '_unused']
        for directory in dirToDel:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)
    # Creating masks for the MEST-C images
    dW.startWrapper(rawDataset, "temp_" + outputDataset, mode=f"mest_{mode}", adapter=adapter)
    if mode == "glom":
        if not separate == "div":
            # Split whole images first (rename=True renames the source dir with a '_train' suffix),
            # then isolate the 'nsg' class in each part
            recreateInfo["val_dataset"] = createValDataset("temp_" + outputDataset,
                                                           valDatasetPath="temp_" + outputDataset + '_val',
                                                           rename=True, valDatasetSizePart=0.05,
                                                           valDatasetMinSize=10, recreateInfo=recreateValList)
            for datasetPart in ["train", "val"]:
                dI.isolateClass(f"temp_{outputDataset}_{datasetPart}", f"{outputDataset}_{datasetPart}", 'nsg',
                                image_size=divisionSize, imageFormat=imageFormat, verbose=3, silent=False)
                sortImages(f"{outputDataset}_{datasetPart}", f"{outputDataset}_unused", mode="mest_glom")
        else:
            # Isolate the 'nsg' class first, then split the isolated crops into train/val
            dI.isolateClass("temp_" + outputDataset, outputDataset, 'nsg', image_size=divisionSize,
                            imageFormat=imageFormat, verbose=3, silent=False)
            sortImages(outputDataset, f"{outputDataset}_unused", mode="mest_glom")
            recreateInfo["val_dataset"] = createValDataset(outputDataset, valDatasetPath=outputDataset + '_val',
                                                           rename=True, valDatasetSizePart=0.05,
                                                           valDatasetMinSize=10, recreateInfo=recreateValList)
    elif mode == "main":
        if separate in ["images", "patient"]:
            # Extract validation images BEFORE dividing so divisions of one image never land on both sides
            if separate == "images":
                recreateInfo["val_dataset"] = createValDataset("temp_" + outputDataset,
                                                               valDatasetPath="temp_" + outputDataset + '_val',
                                                               rename=False, valDatasetSizePart=0.05,
                                                               valDatasetMinSize=10, recreateInfo=recreateValList)
            else:
                recreateInfo["val_dataset"] = createValDatasetByPeople(rawDataset=rawDataset,
                                                                       datasetPath="temp_" + outputDataset,
                                                                       valDatasetPath='temp_' + outputDataset + '_val',
                                                                       nbPatientBiopsie=8, nbPatientNephrectomy=2,
                                                                       recreateInfo=recreateValList)
        # Dividing the dataset
        if separate == "div":
            divide = {"temp_" + outputDataset: outputDataset}
        else:
            divide = {"temp_" + outputDataset + '_val': outputDataset + '_val',
                      "temp_" + outputDataset: outputDataset + '_train'}
        for inputPath, outputPath in divide.items():
            dD.divideDataset(inputPath, outputPath, squareSideLength=divisionSize, min_overlap_part=overlap,
                             mode=f"mest_{mode}", verbose=1)
        if separate == "div":
            # Creating the validation dataset from the divisions of the whole divided dataset
            recreateInfo["val_dataset"] = createValDataset(outputDataset, valDatasetPath=outputDataset + '_val',
                                                           rename=True, valDatasetSizePart=0.05,
                                                           valDatasetMinSize=10, recreateInfo=recreateValList)
        # Removing unusable divisions by moving them into a specific directory
        for datasetPath in [outputDataset + '_train', outputDataset + '_val']:
            sortImages(datasetPath, outputDataset + '_unused', mode="mest_main")
    # NOTE(review): for mode == "glom" the train info uses baseClass='nsg' but the val info uses "glom" —
    # looks like one of the two is a typo; kept as-is pending confirmation
    infoNephrologyDataset(outputDataset + '_train', baseClass='nsg' if mode == "glom" else "cortex")
    infoNephrologyDataset(outputDataset + '_val', baseClass="glom" if mode == "glom" else "cortex")
    if recreateValList is None or len(recreateValList) == 0:
        # Only save the recreate file when this was a fresh split, not a recreation
        with open(f"dataset_mest_{mode}_{formatDate()}.json", 'w') as recreateFile:
            json.dump(recreateInfo, recreateFile, indent="\t")
    print("\nDataset made, nothing left to do")
def generateDataset(rawDataset='raw_dataset', tempDataset='temp_dataset', unusedDirPath='nephrology_dataset_unused',
                    mainDataset='main_dataset', mainDatasetUnusedDirPath='main_dataset_unused', deleteBaseMasks=True,
                    adapter: AnnotationAdapter = None, imageFormat="jpg", recreateValList=None,
                    separateDivInsteadOfImage=False, separateByPatient=True, divisionSize=1024,
                    minDivisionOverlapping=0.33, cleanBeforeStart=False):
    """
    Generates datasets folder from a base directory, all paths are customizable, and it can also remove
    previous directories
    :param rawDataset: path to the base directory
    :param tempDataset: path to a temporary directory
    :param unusedDirPath: path to the unused files' directory
    :param mainDataset: path to the main dataset directory, used to also define main training and validation
                        directories
    :param mainDatasetUnusedDirPath: path to unused files' directory of main dataset
    :param deleteBaseMasks: whether to delete base masks or not
    :param adapter: the adapter used to read annotations files, if None, will detect automatically which one to use
    :param imageFormat: the image format to use for the datasets
    :param recreateValList: list of images to use to recreate val dataset
    :param separateDivInsteadOfImage: if True, divisions of same image can be separated into training and val
                                      directories
    :param separateByPatient: if True and not separateDivInsteadOfImage, will create validation directory based
                              on patient
    :param divisionSize: the size of a division, default is 1024
    :param minDivisionOverlapping: the min overlapping between two divisions, default is 33%
    :param cleanBeforeStart: if True, will delete previous directories that could still exist
    :return: None
    """
    if cleanBeforeStart:
        # Removing temp directories
        import shutil
        # Also remove the main dataset dir and its unused dir so a previous run cannot pollute this one
        # (kept consistent with the other generate*Dataset functions of this module)
        dirToDel = [tempDataset, unusedDirPath, 'temp_' + mainDataset + '_val', mainDataset,
                    mainDataset + '_val', mainDataset + '_train', mainDatasetUnusedDirPath]
        for directory in dirToDel:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)
    # Creating masks and making per image directories
    dW.startWrapper(rawDataset, tempDataset, deleteBaseMasks=deleteBaseMasks, adapter=adapter,
                    imageFormat=imageFormat, mode="main")
    infoNephrologyDataset(tempDataset, baseClass='cortex')
    checkNSG(tempDataset)
    # Sorting images to keep those that can be used
    sortImages(datasetPath=tempDataset, unusedDirPath=unusedDirPath, mode="main")
    # Recorded so the exact same dataset can be regenerated later
    recreateInfo = {"mode": "main", "temp_dataset": tempDataset, "unused_dir_path": unusedDirPath,
                    "main_dataset": mainDataset, "main_dataset_unused_path": mainDatasetUnusedDirPath,
                    "delete_base_masks": deleteBaseMasks, "image_format": imageFormat,
                    "separate": "div" if separateDivInsteadOfImage
                    else ("patient" if separateByPatient else "images"),
                    "division_size": divisionSize, "min_overlap_part": minDivisionOverlapping,
                    "clean_before_start": cleanBeforeStart, "val_dataset": []}
    if separateDivInsteadOfImage:
        # Dividing main dataset in 1024*1024 divisions
        dD.divideDataset(tempDataset, mainDataset, squareSideLength=divisionSize,
                         min_overlap_part=minDivisionOverlapping, verbose=1)
        infoNephrologyDataset(mainDataset, baseClass='cortex')
        # If you want to keep all cortex files comment dW.cleanCortexDir() lines
        # If you want to check them and then delete them, comment these lines too and after checking use them
        # dW.cleanFusedClassDir(tempDataset, 'cortex')
        # dW.cleanFusedClassDir(mainDataset, 'cortex')
        # Removing unusable images by moving them into a specific directory
        sortImages(mainDataset, unusedDirPath=mainDatasetUnusedDirPath)
        # Taking some images from the main dataset to make the validation dataset
        recreateInfo["val_dataset"] = createValDataset(mainDataset, rename=True, recreateInfo=recreateValList)
    else:
        # To avoid having divisions of same image dispatched in main and validation datasets, the validation
        # images are separated BEFORE dividing
        if separateByPatient:
            recreateInfo["val_dataset"] = createValDatasetByPeople(rawDataset=rawDataset, datasetPath=tempDataset,
                                                                   valDatasetPath='temp_' + mainDataset + '_val',
                                                                   nbPatientBiopsie=7, nbPatientNephrectomy=3,
                                                                   recreateInfo=recreateValList)
        else:
            # Taking some images from the main dataset to make the validation dataset
            recreateInfo["val_dataset"] = createValDataset(tempDataset,
                                                           valDatasetPath='temp_' + mainDataset + '_val',
                                                           rename=False, recreateInfo=recreateValList)
        # Dividing the main dataset after having separated images for the validation dataset
        # then removing unusable divisions
        dD.divideDataset(tempDataset, mainDataset + '_train', squareSideLength=divisionSize,
                         min_overlap_part=minDivisionOverlapping, verbose=1)
        sortImages(mainDataset + '_train', unusedDirPath=mainDatasetUnusedDirPath)
        # Same thing with the validation dataset directly
        dD.divideDataset('temp_' + mainDataset + '_val', mainDataset + '_val', squareSideLength=divisionSize,
                         min_overlap_part=minDivisionOverlapping, verbose=1)
        sortImages(mainDataset + '_val', unusedDirPath=mainDatasetUnusedDirPath)
    infoNephrologyDataset(mainDataset + '_train', baseClass='cortex')
    infoNephrologyDataset(mainDataset + '_val', baseClass='cortex')
    if recreateValList is None or len(recreateValList) == 0:
        # Only save the recreate file when this was a fresh split, not a recreation
        with open(f"dataset_{formatDate()}.json", 'w') as recreateFile:
            json.dump(recreateInfo, recreateFile, indent="\t")
    print("\nDataset made, nothing left to do")