def get_labels(datapath, dataset, tile_size):
    """Extract labels from the `dataset` glossary, according to the
    preprocessed version of the dataset

    Parameters
    ----------
    datapath : str
        Path of the data on the file system
    dataset : str
        Name of the dataset
    tile_size : int
        Size of preprocessed images, in pixels

    Returns
    -------
    list
        List of dictionaries that describe the dataset labels
    """
    prepro_folder = utils.prepare_preprocessed_folder(
        datapath, dataset, tile_size,
    )
    if os.path.isfile(prepro_folder["testing_config"]):
        test_config = utils.read_config(prepro_folder["testing_config"])
    else:
        raise ValueError(
            "There is no testing data with the given parameters. "
            "Please generate a valid dataset before calling the program."
        )
    return [label for label in test_config["labels"] if label["is_evaluate"]]
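
# Hedged usage sketch (not part of the original module): assumes a "shapes"
# dataset has already been preprocessed with 64-pixel tiles under ./data,
# so that prepro_folder["testing_config"] exists.
labels = get_labels("./data", "shapes", 64)
for label in labels:
    print(label["category"], label["color"])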
def test_preprocessed_folder(datapath_repo):
    """Test the creation of the preprocessed data repositories, by checking
    the full expected tree, *i.e.* considering training, validation and
    testing repositories within an instance-specific folder, and images and
    labels repositories within each of these subrepositories
    """
    datapath = str(datapath_repo)
    dataset = "shapes"
    image_size = 64
    aggregate = "full"
    prepare_preprocessed_folder(datapath, dataset, image_size, aggregate)
    instance_path = os.path.join(
        datapath, dataset, "preprocessed", str(image_size) + "_" + aggregate
    )
    assert os.path.isdir(os.path.join(datapath, dataset, "preprocessed"))
    assert os.path.isdir(instance_path)
    assert os.path.isdir(os.path.join(instance_path, "training"))
    assert os.path.isdir(os.path.join(instance_path, "training", "images"))
    assert os.path.isdir(os.path.join(instance_path, "training", "labels"))
    assert os.path.isdir(os.path.join(instance_path, "validation"))
    assert os.path.isdir(os.path.join(instance_path, "validation", "images"))
    assert os.path.isdir(os.path.join(instance_path, "validation", "labels"))
    assert os.path.isdir(os.path.join(instance_path, "testing"))
    assert os.path.isdir(os.path.join(instance_path, "testing", "images"))
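
# A hypothetical, more compact variant of the same tree check (sketch only,
# behaviour-equivalent under the same fixture): check_preprocessed_tree is
# not part of the test suite.
import itertools
import os


def check_preprocessed_tree(datapath, dataset, image_size, aggregate):
    instance_path = os.path.join(
        datapath, dataset, "preprocessed", str(image_size) + "_" + aggregate
    )
    assert os.path.isdir(instance_path)
    for subset, content in itertools.product(
        ("training", "validation"), ("images", "labels")
    ):
        assert os.path.isdir(os.path.join(instance_path, subset, content))
    # the testing subset only holds images: its labels are what we predict
    assert os.path.isdir(os.path.join(instance_path, "testing", "images"))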
def main(args):
    # Grid search
    model_output = []
    for batch_size in args.batch_size:
        logger.info("Generating data with batches of %s images...", batch_size)
        # Data generator building
        prepro_folder = utils.prepare_preprocessed_folder(
            args.datapath, args.dataset, args.image_size
        )
        nb_labels, train_gen, valid_gen = get_data(
            prepro_folder,
            args.dataset,
            args.model,
            args.image_size,
            batch_size,
        )
        for parameters in itertools.product(
            args.dropout,
            args.network,
            args.learning_rate,
            args.learning_rate_decay,
        ):
            logger.info("Instance: %s", utils.list_to_str(parameters))
            # Data path and repository management
            dropout, network, learning_rate, learning_rate_decay = parameters
            instance_args = [
                args.name,
                args.image_size,
                network,
                batch_size,
                dropout,
                learning_rate,
                learning_rate_decay,
            ]
            instance_name = utils.list_to_str(instance_args, "_")
            output_folder = utils.prepare_output_folder(
                args.datapath, args.dataset, args.model, instance_name
            )
            # Model running
            model_output.append(
                run_model(
                    train_gen,
                    valid_gen,
                    args.model,
                    output_folder,
                    instance_name,
                    args.image_size,
                    nb_labels,
                    args.nb_epochs,
                    args.nb_training_image,
                    args.nb_validation_image,
                    batch_size,
                    *parameters,
                )
            )
            logger.info("Instance result: %s", model_output[-1])
    # Recover the best instance according to validation accuracy
    best_instance = max(model_output, key=lambda x: x["val_acc"])
    # Save best model
    output_folder = utils.prepare_output_folder(
        args.datapath, args.dataset, args.model
    )
    instance_name = os.path.join(
        output_folder, "best-{}-" + str(args.image_size) + ".{}"
    )
    best_instance["model"].save(instance_name.format("model", "h5"))
    with open(instance_name.format("instance", "json"), "w") as fobj:
        json.dump(
            {
                key: best_instance[key]
                for key in best_instance
                if key != "model"
            },
            fobj,
        )
    backend.clear_session()
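
# Standalone illustration of the grid enumeration performed above (not part
# of the original module): itertools.product yields every combination of
# the hyperparameter lists, so a 2 x 2 x 2 x 1 grid gives 8 instances. The
# values below are purely illustrative.
import itertools

dropouts = [0.5, 0.75]
networks = ["simple", "vgg"]
learning_rates = [1e-3, 1e-4]
decays = [1e-6]

for dropout, network, learning_rate, decay in itertools.product(
    dropouts, networks, learning_rates, decays
):
    print(dropout, network, learning_rate, decay)  # 8 combinations printed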
def main(args):
    # Data path and repository management
    input_folder = utils.prepare_input_folder(args.datapath, args.dataset)
    prepro_folder = utils.prepare_preprocessed_folder(
        args.datapath, args.dataset, args.image_size
    )
    # Dataset creation
    if args.dataset == "mapillary":
        config_path = os.path.join(input_folder, "config_aggregate.json")
        train_dataset = MapillaryDataset(args.image_size, config_path)
        validation_dataset = MapillaryDataset(args.image_size, config_path)
        test_dataset = MapillaryDataset(args.image_size, config_path)
    elif args.dataset == "shapes":
        train_dataset = ShapeDataset(args.image_size)
        validation_dataset = ShapeDataset(args.image_size)
        test_dataset = ShapeDataset(args.image_size)
        os.makedirs(
            os.path.join(prepro_folder["testing"], "labels"), exist_ok=True
        )
    elif args.dataset == "aerial":
        train_dataset = AerialDataset(args.image_size)
        validation_dataset = AerialDataset(args.image_size)
        test_dataset = AerialDataset(args.image_size)
    elif args.dataset == "tanzania":
        train_dataset = TanzaniaDataset(args.image_size)
        validation_dataset = TanzaniaDataset(args.image_size)
        test_dataset = TanzaniaDataset(args.image_size)
    else:
        logger.error(
            "Unsupported dataset type. Please choose amongst %s",
            AVAILABLE_DATASETS,
        )
        sys.exit(1)
    # Dataset populating/loading
    # (depends on the existence of a specification file)
    if args.nb_training_image > 0:
        if os.path.isfile(prepro_folder["training_config"]):
            train_dataset.load(
                prepro_folder["training_config"], args.nb_training_image
            )
        else:
            logger.info(
                "No existing configuration file for this dataset. Create %s.",
                prepro_folder["training_config"],
            )
            input_image_dir = os.path.join(input_folder, "training")
            train_dataset.populate(
                prepro_folder["training"],
                input_image_dir,
                nb_images=args.nb_training_image,
                nb_processes=int(config.get("running", "processes")),
            )
            train_dataset.save(prepro_folder["training_config"])
    if args.nb_validation_image > 0:
        if os.path.isfile(prepro_folder["validation_config"]):
            validation_dataset.load(
                prepro_folder["validation_config"], args.nb_validation_image
            )
        else:
            logger.info(
                "No existing configuration file for this dataset. Create %s.",
                prepro_folder["validation_config"],
            )
            input_image_dir = os.path.join(input_folder, "validation")
            validation_dataset.populate(
                prepro_folder["validation"],
                input_image_dir,
                nb_images=args.nb_validation_image,
                nb_processes=int(config.get("running", "processes")),
            )
            validation_dataset.save(prepro_folder["validation_config"])
    if args.nb_testing_image > 0:
        if os.path.isfile(prepro_folder["testing_config"]):
            test_dataset.load(
                prepro_folder["testing_config"], args.nb_testing_image
            )
        else:
            logger.info(
                "No existing configuration file for this dataset. Create %s.",
                prepro_folder["testing_config"],
            )
            input_image_dir = os.path.join(input_folder, "testing")
            test_dataset.populate(
                prepro_folder["testing"],
                input_image_dir,
                nb_images=args.nb_testing_image,
                labelling=False,
                nb_processes=int(config.get("running", "processes")),
            )
            test_dataset.save(prepro_folder["testing_config"])
    glossary = pd.DataFrame(train_dataset.labels)
    glossary["popularity"] = train_dataset.get_label_popularity()
    logger.info("Data glossary:\n%s", glossary)
    sys.exit(0)
# Parse command-line arguments
parser = argparse.ArgumentParser(
    description="Convolutional Neural Network on street-scene images"
)
parser = add_instance_arguments(parser)
parser = add_hyperparameters(parser)
parser = add_training_arguments(parser)
args = parser.parse_args()

# Data path and repository management
aggregate_value = "full" if not args.aggregate_label else "aggregated"
instance_args = [
    args.name,
    args.image_size,
    args.network,
    args.batch_size,
    aggregate_value,
    args.dropout,
    args.learning_rate,
    args.learning_rate_decay,
]
instance_name = utils.list_to_str(instance_args, "_")
prepro_folder = utils.prepare_preprocessed_folder(
    args.datapath, args.dataset, args.image_size, aggregate_value
)

# Data gathering
if (
    os.path.isfile(prepro_folder["training_config"])
    and os.path.isfile(prepro_folder["validation_config"])
    and os.path.isfile(prepro_folder["testing_config"])
):
    train_config = utils.read_config(prepro_folder["training_config"])
    label_ids = [
        x["id"] for x in train_config["labels"] if x["is_evaluate"]
    ]
    train_generator = generator.create_generator(
        args.dataset,
        args.model,
        prepro_folder["training"],
        args.image_size,
        args.batch_size,
        label_ids,
        seed=SEED,
    )
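
# Hedged sketch of the instance-name convention (an assumption drawn from
# the call sites, not the actual utils code): list_to_str seemingly joins
# the stringified items with the given separator.
def list_to_str(items, separator="-"):
    return separator.join(str(item) for item in items)


# e.g. "cnn_224_vgg_50_full_0.75_0.001_1e-06"
print(list_to_str(["cnn", 224, "vgg", 50, "full", 0.75, 0.001, 1e-6], "_"))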
def main(args):
    # Data path and repository management
    input_folder = utils.prepare_input_folder(args.datapath, args.dataset)
    prepro_folder = utils.prepare_preprocessed_folder(
        args.datapath, args.dataset, args.image_size
    )
    if (
        args.dataset in GEOGRAPHIC_DATASETS
        and (args.nb_training_image > 0 or args.nb_validation_image > 0)
        and args.nb_tiles_per_image is None
    ):
        raise ValueError(
            "The amount of tiles per image must be specified for "
            f"the {args.dataset} dataset, if training and/or validation "
            "images are required. See 'deepo datagen -h' for more details."
        )
    # Dataset creation
    if args.dataset == "mapillary":
        config_path = os.path.join(input_folder, "config_aggregate.json")
        train_dataset = MapillaryDataset(args.image_size, config_path)
        validation_dataset = MapillaryDataset(args.image_size, config_path)
        test_dataset = MapillaryDataset(args.image_size, config_path)
    elif args.dataset == "shapes":
        train_dataset = ShapeDataset(args.image_size)
        validation_dataset = ShapeDataset(args.image_size)
        test_dataset = ShapeDataset(args.image_size)
        os.makedirs(
            os.path.join(prepro_folder["testing"], "labels"), exist_ok=True
        )
    elif args.dataset == "aerial":
        train_dataset = AerialDataset(args.image_size)
        validation_dataset = AerialDataset(args.image_size)
        test_dataset = AerialDataset(args.image_size)
    elif args.dataset == "tanzania":
        train_dataset = TanzaniaDataset(args.image_size)
        validation_dataset = TanzaniaDataset(args.image_size)
        test_dataset = TanzaniaDataset(args.image_size)
    else:
        raise ValueError(
            "Unsupported dataset type. "
            f"Please choose amongst {AVAILABLE_DATASETS}"
        )
    # Dataset populating/loading
    # (depends on the existence of a specification file)
    if args.nb_training_image > 0:
        if os.path.isfile(prepro_folder["training_config"]):
            train_dataset.load(
                prepro_folder["training_config"], args.nb_training_image
            )
        else:
            logger.info(
                "No existing configuration file for this dataset. Create %s.",
                prepro_folder["training_config"],
            )
            input_image_dir = os.path.join(input_folder, "training")
            train_dataset.populate(
                prepro_folder["training"],
                input_image_dir,
                nb_images=args.nb_training_image,
                nb_processes=int(config.get("running", "processes")),
                nb_tiles_per_image=args.nb_tiles_per_image,
            )
            train_dataset.save(prepro_folder["training_config"])
    if args.nb_validation_image > 0:
        if os.path.isfile(prepro_folder["validation_config"]):
            validation_dataset.load(
                prepro_folder["validation_config"], args.nb_validation_image
            )
        else:
            logger.info(
                "No existing configuration file for this dataset. Create %s.",
                prepro_folder["validation_config"],
            )
            input_image_dir = os.path.join(input_folder, "validation")
            validation_dataset.populate(
                prepro_folder["validation"],
                input_image_dir,
                nb_images=args.nb_validation_image,
                nb_processes=int(config.get("running", "processes")),
                nb_tiles_per_image=args.nb_tiles_per_image,
            )
            validation_dataset.save(prepro_folder["validation_config"])
    if args.nb_testing_image > 0:
        if os.path.isfile(prepro_folder["testing_config"]):
            test_dataset.load(
                prepro_folder["testing_config"], args.nb_testing_image
            )
        else:
            logger.info(
                "No existing configuration file for this dataset. Create %s.",
                prepro_folder["testing_config"],
            )
            input_image_dir = os.path.join(input_folder, "testing")
            test_dataset.populate(
                prepro_folder["testing"],
                input_image_dir,
                nb_images=args.nb_testing_image,
                labelling=False,
                nb_processes=int(config.get("running", "processes")),
            )
            test_dataset.save(prepro_folder["testing_config"])
    glossary = pd.DataFrame(train_dataset.labels)
    glossary["popularity"] = train_dataset.get_label_popularity()
    logger.info("Data glossary:\n%s", glossary)
def predict(
    filenames,
    dataset,
    problem,
    datapath="./data",
    name=None,
    network=None,
    batch_size=None,
    dropout=None,
    learning_rate=None,
    learning_rate_decay=None,
    output_dir="/tmp/deeposlandia/predicted",
):
    """Make label predictions on the images indicated by `filenames`,
    according to the considered `problem`

    Parameters
    ----------
    filenames : list of str
        Name patterns of the image files on the file system
    dataset : str
        Name of the dataset
    problem : str
        Name of the considered model, either `featdet` or `semseg`
    datapath : str
        Relative path of the dataset repository
    name : str
        Name of the saved network
    network : str
        Name of the chosen architecture, either `simple`, `vgg` or
        `inception`
    batch_size : int
        Batch size used for training the model
    dropout : float
        Dropout rate used for training the model
    learning_rate : float
        Learning rate used for training the model
    learning_rate_decay : float
        Learning rate decay used for training the model
    output_dir : str
        Path of the output directory, where labelled images will be stored
        (useful only if `problem=semseg`)

    Returns
    -------
    dict
        Predictions (between 0 and 1, acting as percentages) regarding each
        label
    """
    # `image_paths` is first built as
    # [[image1, ..., image_i], [image_j, ..., image_n]],
    # then it is flattened to get a simple list
    image_paths = [glob.glob(f) for f in filenames]
    flattened_image_paths = sum(image_paths, [])
    images = extract_images(flattened_image_paths)
    model_input_size = images.shape[1]
    instance_args = [
        name,
        model_input_size,
        network,
        batch_size,
        dropout,
        learning_rate,
        learning_rate_decay,
    ]
    instance_name = utils.list_to_str(instance_args, "_")
    prepro_folder = utils.prepare_preprocessed_folder(
        datapath, dataset, model_input_size
    )
    if os.path.isfile(prepro_folder["training_config"]):
        train_config = utils.read_config(prepro_folder["training_config"])
        label_ids = [
            x["id"] for x in train_config["labels"] if x["is_evaluate"]
        ]
        nb_labels = len(label_ids)
    else:
        raise FileNotFoundError(
            "There is no training data with the given parameters. "
            "Please generate a valid dataset before calling the program."
        )
    output_folder = utils.prepare_output_folder(
        datapath, dataset, model_input_size, problem
    )
    instance_path = output_folder["best-instance"]
    dropout, network = utils.recover_instance(instance_path)
    model = init_model(
        problem,
        instance_name,
        model_input_size,
        nb_labels,
        dropout,
        network,
    )
    if os.path.isfile(output_folder["best-model"]):
        model.load_weights(output_folder["best-model"])
        logger.info(
            "Model weights have been recovered from %s",
            output_folder["best-model"],
        )
    else:
        logger.info(
            "No available trained model for this image size with optimized "
            "hyperparameters. The inference will be done on an untrained "
            "model."
        )
    y_raw_pred = model.predict(images, batch_size=2, verbose=1)
    result = {}
    if problem == "featdet":
        label_info = [
            (i["category"], utils.GetHTMLColor(i["color"]))
            for i in train_config["labels"]
        ]
        for filename, prediction in zip(flattened_image_paths, y_raw_pred):
            result[filename] = [
                (i[0], 100 * round(float(j), 2), i[1])
                for i, j in zip(label_info, prediction)
            ]
        return result
    elif problem == "semseg":
        os.makedirs(output_dir, exist_ok=True)
        predicted_labels = np.argmax(y_raw_pred, axis=3)
        encountered_labels = np.unique(predicted_labels)
        meaningful_labels = [
            x
            for i, x in enumerate(train_config["labels"])
            if i in encountered_labels
        ]
        labelled_images = np.zeros(
            shape=np.append(predicted_labels.shape, 3), dtype=np.uint8
        )
        for i in range(nb_labels):
            labelled_images[predicted_labels == i] = train_config["labels"][
                i
            ]["color"]
        for labelled_image, filename in zip(
            labelled_images, flattened_image_paths
        ):
            predicted_image = Image.fromarray(labelled_image, "RGB")
            filename = filename.replace(".jpg", ".png")
            predicted_image_path = os.path.join(
                output_dir, os.path.basename(filename)
            )
            predicted_image.save(predicted_image_path)
            result[filename] = os.path.basename(filename)
        return {
            "labels": summarize_config(meaningful_labels),
            "label_images": result,
        }
    else:
        raise ValueError(
            "Unknown model argument. Please use 'featdet' or 'semseg'."
        )
def predict(
    filenames,
    dataset,
    problem,
    datapath="./data",
    aggregate=False,
    name=None,
    network=None,
    batch_size=None,
    dropout=None,
    learning_rate=None,
    learning_rate_decay=None,
    output_dir="/tmp/deeposlandia/predicted",
):
    """Make label predictions on the images indicated by `filenames`,
    according to the considered `problem`

    Parameters
    ----------
    filenames : list of str
        Name patterns of the image files on the file system
    dataset : str
        Name of the dataset
    problem : str
        Name of the considered model, either `feature_detection` or
        `semantic_segmentation`
    datapath : str
        Relative path of the dataset repository
    aggregate : bool
        Whether or not the labels are aggregated
    name : str
        Name of the saved network
    network : str
        Name of the chosen architecture, either `simple`, `vgg` or
        `inception`
    batch_size : int
        Batch size used for training the model
    dropout : float
        Dropout rate used for training the model
    learning_rate : float
        Learning rate used for training the model
    learning_rate_decay : float
        Learning rate decay used for training the model
    output_dir : str
        Path of the output directory, where labelled images will be stored
        (useful only if `problem=semantic_segmentation`)

    Returns
    -------
    dict
        Predictions (between 0 and 1, acting as percentages) regarding each
        label
    """
    # `image_paths` is first built as
    # [[image1, ..., image_i], [image_j, ..., image_n]],
    # then it is flattened to get a simple list
    image_paths = [glob.glob(f) for f in filenames]
    flattened_image_paths = sum(image_paths, [])
    images = extract_images(flattened_image_paths)
    model_input_size = images.shape[1]
    if dataset == "aerial":
        tile_size = utils.get_tile_size_from_image(model_input_size)
    else:
        tile_size = model_input_size
    aggregate_value = "full" if not aggregate else "aggregated"
    instance_args = [
        name,
        tile_size,
        network,
        batch_size,
        aggregate_value,
        dropout,
        learning_rate,
        learning_rate_decay,
    ]
    instance_name = utils.list_to_str(instance_args, "_")
    prepro_folder = utils.prepare_preprocessed_folder(
        datapath, dataset, tile_size, aggregate_value
    )
    if os.path.isfile(prepro_folder["training_config"]):
        train_config = utils.read_config(prepro_folder["training_config"])
        label_ids = [
            x["id"] for x in train_config["labels"] if x["is_evaluate"]
        ]
        nb_labels = len(label_ids)
    else:
        logger.error(
            "There is no training data with the given parameters. "
            "Please generate a valid dataset before calling the program."
        )
        sys.exit(1)
    if any(arg is None for arg in instance_args):
        logger.info("Some arguments are None, the best model is considered.")
        output_folder = utils.prepare_output_folder(datapath, dataset, problem)
        instance_filename = (
            "best-instance-" + str(tile_size) + "-" + aggregate_value + ".json"
        )
        instance_path = os.path.join(output_folder, instance_filename)
        dropout, network = utils.recover_instance(instance_path)
        model = init_model(
            problem,
            instance_name,
            model_input_size,
            nb_labels,
            dropout,
            network,
        )
        checkpoint_filename = (
            "best-model-" + str(tile_size) + "-" + aggregate_value + ".h5"
        )
        checkpoint_full_path = os.path.join(output_folder, checkpoint_filename)
        if os.path.isfile(checkpoint_full_path):
            logger.info("Checkpoint full path: %s", checkpoint_full_path)
            model.load_weights(checkpoint_full_path)
            logger.info(
                "Model weights have been recovered from %s",
                checkpoint_full_path,
            )
        else:
            logger.info(
                "No available trained model for this image size with "
                "optimized hyperparameters. The inference will be done on "
                "an untrained model."
            )
    else:
        logger.info("All instance arguments are filled out.")
        output_folder = utils.prepare_output_folder(
            datapath, dataset, problem, instance_name
        )
        model = init_model(
            problem,
            instance_name,
            model_input_size,
            nb_labels,
            dropout,
            network,
        )
        checkpoints = [
            item
            for item in os.listdir(output_folder)
            if "checkpoint-epoch" in item
        ]
        if len(checkpoints) > 0:
            model_checkpoint = max(checkpoints)
            checkpoint_full_path = os.path.join(
                output_folder, model_checkpoint
            )
            model.load_weights(checkpoint_full_path)
            logger.info(
                "Model weights have been recovered from %s",
                checkpoint_full_path,
            )
        else:
            logger.info(
                "No available checkpoint for this configuration. "
                "The model will be trained from scratch."
            )
    y_raw_pred = model.predict(images)
    result = {}
    if problem == "feature_detection":
        label_info = [
            (i["category"], utils.GetHTMLColor(i["color"]))
            for i in train_config["labels"]
        ]
        for filename, prediction in zip(flattened_image_paths, y_raw_pred):
            result[filename] = [
                (i[0], 100 * round(float(j), 2), i[1])
                for i, j in zip(label_info, prediction)
            ]
        return result
    elif problem == "semantic_segmentation":
        os.makedirs(output_dir, exist_ok=True)
        predicted_labels = np.argmax(y_raw_pred, axis=3)
        encountered_labels = np.unique(predicted_labels)
        meaningful_labels = [
            x
            for i, x in enumerate(train_config["labels"])
            if i in encountered_labels
        ]
        labelled_images = np.zeros(
            shape=np.append(predicted_labels.shape, 3), dtype=np.uint8
        )
        for i in range(nb_labels):
            labelled_images[predicted_labels == i] = train_config["labels"][
                i
            ]["color"]
        for labelled_image, filename in zip(
            labelled_images, flattened_image_paths
        ):
            predicted_image = Image.fromarray(labelled_image, "RGB")
            filename = filename.replace(".jpg", ".png")
            predicted_image_path = os.path.join(
                output_dir, os.path.basename(filename)
            )
            predicted_image.save(predicted_image_path)
            result[filename] = os.path.basename(filename)
        return {
            "labels": summarize_config(meaningful_labels),
            "label_images": result,
        }
    else:
        logger.error(
            "Unknown model argument. Please use 'feature_detection' or "
            "'semantic_segmentation'."
        )
        sys.exit(1)
def main(args): logger.info("Postprocess %s...", args.image_basename) features = get_image_features( args.datapath, args.dataset, args.image_basename ) img_width, img_height = features["width"], features["height"] logger.info("Raw image size: %s, %s" % (img_width, img_height)) prepro_folder = utils.prepare_preprocessed_folder(args.datapath, args.dataset, args.image_size) image_paths = get_image_paths(prepro_folder["testing"], args.image_basename) logger.info("The image will be splitted into %s tiles" % len(image_paths)) images = extract_images(image_paths) coordinates = extract_coordinates_from_filenames(image_paths) labels = get_labels(args.datapath, args.dataset, args.image_size) output_folder = utils.prepare_output_folder( args.datapath, args.dataset, args.image_size, "semseg" ) model = get_trained_model( output_folder["best-model"], args.image_size, len(labels) ) logger.info("Predict labels for input images...") data = build_full_labelled_image( images, coordinates, model, args.image_size, img_width, img_height, args.batch_size, ) logger.info( "Labelled image dimension: %s, %s" % (data.shape[0], data.shape[1]) ) colored_data = assign_label_colors(data, labels) if args.draw_grid: colored_data = draw_grid( colored_data, img_width, img_height, args.image_size ) predicted_label_file = os.path.join( output_folder["labels"], args.image_basename + "_" + str(args.image_size) + ".png", ) Image.fromarray(colored_data).save(predicted_label_file) vectorized_labels, vectorized_data = geometries.vectorize_mask( data, colored_data, labels ) gdf = gpd.GeoDataFrame( {"labels": vectorized_labels, "geometry": vectorized_data} ) predicted_geom_file = os.path.join( output_folder["geometries"], args.image_basename + "_" + str(args.image_size) + ".geojson", ) if os.path.isfile(predicted_geom_file): os.remove(predicted_geom_file) gdf.to_file(predicted_geom_file, driver="GeoJSON") rasterized_data = geometries.rasterize_polygons( vectorized_data, vectorized_labels, img_height, img_width ) colored_raster_data = assign_label_colors(rasterized_data, labels) if args.draw_grid: colored_raster_data = draw_grid( colored_raster_data, img_width, img_height, args.image_size ) predicted_raster_file = os.path.join( output_folder["rasters"], args.image_basename + "_" + str(args.image_size) + ".png", ) Image.fromarray(colored_raster_data).save(predicted_raster_file)