def test_wrong_model_dataset_generator(shapes_sample_config):
    """Test a wrong model and wrong dataset
    """
    dataset = "fake"
    model = "conquer_the_world"
    IMAGE_SIZE = 10
    BATCH_SIZE = 10
    datapath = "./tests/data/" + dataset + "/training"
    config = utils.read_config(shapes_sample_config)
    # wrong dataset name
    with pytest.raises(ValueError) as excinfo:
        generator.create_generator(
            dataset,
            "featdet",
            datapath,
            IMAGE_SIZE,
            BATCH_SIZE,
            config["labels"],
        )
    assert str(excinfo.value) == "Wrong dataset name {}".format(dataset)
    # wrong model name
    with pytest.raises(ValueError) as excinfo:
        generator.create_generator(
            "shapes", model, datapath, IMAGE_SIZE, BATCH_SIZE, config["labels"]
        )
    expected_failure_msg = "Wrong model name {} (choose amongst {})".format(
        model, AVAILABLE_MODELS
    )
    assert str(excinfo.value) == expected_failure_msg
def test_semseg_aerial_generator(
    aerial_image_size, aerial_sample, aerial_sample_config, nb_channels
):
    """Test the data generator for the AerialImage dataset
    """
    BATCH_SIZE = 4
    config = utils.read_config(aerial_sample_config)
    label_ids = [x["id"] for x in config["labels"]]
    gen = generator.create_generator(
        "aerial",
        "semseg",
        aerial_sample,
        aerial_image_size,
        BATCH_SIZE,
        config["labels"],
    )
    item = next(gen)
    assert len(item) == 2
    im_shape = item[0].shape
    assert im_shape == (
        BATCH_SIZE,
        aerial_image_size,
        aerial_image_size,
        nb_channels,
    )
    label_shape = item[1].shape
    assert label_shape == (
        BATCH_SIZE,
        aerial_image_size,
        aerial_image_size,
        len(label_ids),
    )
def test_semseg_tanzania_generator(
    tanzania_image_size, tanzania_sample, tanzania_sample_config, nb_channels
):
    """Test the data generator for the Open AI Tanzania dataset
    """
    BATCH_SIZE = 3
    config = utils.read_config(tanzania_sample_config)
    label_ids = [x["id"] for x in config["labels"]]
    gen = generator.create_generator(
        "tanzania",
        "semseg",
        tanzania_sample,
        tanzania_image_size,
        BATCH_SIZE,
        config["labels"],
    )
    item = next(gen)
    assert len(item) == 2
    im_shape = item[0].shape
    assert im_shape == (
        BATCH_SIZE,
        tanzania_image_size,
        tanzania_image_size,
        nb_channels,
    )
    label_shape = item[1].shape
    assert label_shape == (
        BATCH_SIZE,
        tanzania_image_size,
        tanzania_image_size,
        len(label_ids),
    )
def test_featdet_shape_generator(
    shapes_image_size, shapes_sample, shapes_sample_config, nb_channels
):
    """Test the data generator for the shape dataset
    """
    BATCH_SIZE = 10
    config = utils.read_config(shapes_sample_config)
    label_ids = [x["id"] for x in config["labels"]]
    gen = generator.create_generator(
        "shapes",
        "featdet",
        shapes_sample,
        shapes_image_size,
        BATCH_SIZE,
        config["labels"],
    )
    item = next(gen)
    assert len(item) == 2
    im_shape = item[0].shape
    assert im_shape == (
        BATCH_SIZE,
        shapes_image_size,
        shapes_image_size,
        nb_channels,
    )
    label_shape = item[1].shape
    assert label_shape == (BATCH_SIZE, len(label_ids))
def test_semseg_mapillary_generator(
    mapillary_image_size,
    mapillary_sample,
    mapillary_sample_config,
    nb_channels,
):
    """Test the data generator for the Mapillary dataset
    """
    BATCH_SIZE = 10
    config = utils.read_config(mapillary_sample_config)
    label_ids = [x["id"] for x in config["labels"]]
    gen = generator.create_generator(
        "mapillary",
        "semseg",
        mapillary_sample,
        mapillary_image_size,
        BATCH_SIZE,
        config["labels"],
    )
    item = next(gen)
    assert len(item) == 2
    im_shape = item[0].shape
    assert im_shape == (
        BATCH_SIZE,
        mapillary_image_size,
        mapillary_image_size,
        nb_channels,
    )
    label_shape = item[1].shape
    assert label_shape == (
        BATCH_SIZE,
        mapillary_image_size,
        mapillary_image_size,
        len(label_ids),
    )
def build_glossary(self, config_filename):
    """Read the Mapillary glossary stored as a json file at the data
    repository root

    Parameters
    ----------
    config_filename : str
        String designating the relative path of the dataset glossary
        (based on Mapillary dataset)
    """
    glossary = utils.read_config(config_filename)
    if "labels" not in glossary:
        logger.error("There is no 'labels' key in the provided glossary.")
        return None
    for lab_id, label in enumerate(glossary["labels"]):
        if "aggregate" in config_filename:
            self.add_label(
                lab_id,
                label["name"],
                label["color"],
                label["evaluate"],
                label["family"],
                label["contains_id"],
                label["contains"],
            )
        else:
            # Non-aggregated label names look like "family--...--name":
            # keep the last item as the name and the first as the family
            name_items = label["name"].split("--")
            self.add_label(
                lab_id,
                name_items[-1],
                label["color"],
                label["evaluate"],
                name_items[0],
            )
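# A minimal glossary sketch that `build_glossary` can parse (hypothetical
# values, shown for illustration only; the real file ships with the Mapillary
# dataset). In the non-aggregated case, label names encode the label family
# with "--" separators, which the method splits to recover name and family:
SAMPLE_GLOSSARY = {
    "labels": [
        {
            "name": "construction--flat--road",
            "color": [128, 64, 128],
            "evaluate": True,
        },
        {
            "name": "nature--vegetation",
            "color": [107, 142, 35],
            "evaluate": False,
        },
    ]
}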
def get_labels(datapath, dataset, tile_size):
    """Extract labels from the `dataset` glossary, according to the
    preprocessed version of the dataset

    Parameters
    ----------
    datapath : str
        Path of the data on the file system
    dataset : str
        Name of the dataset
    tile_size : int
        Size of preprocessed images, in pixels

    Returns
    -------
    list
        List of dictionaries that describe the dataset labels
    """
    prepro_folder = utils.prepare_preprocessed_folder(
        datapath, dataset, tile_size,
    )
    if os.path.isfile(prepro_folder["testing_config"]):
        test_config = utils.read_config(prepro_folder["testing_config"])
    else:
        raise ValueError(
            "There is no testing data with the given parameters. Please "
            "generate a valid dataset before calling the program."
        )
    return [label for label in test_config["labels"] if label["is_evaluate"]]
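# Usage sketch for `get_labels` (paths and tile size are illustrative
# assumptions; a preprocessed dataset must already exist on disk):
#
#     labels = get_labels("./data", "shapes", 64)
#     evaluated_names = [label["name"] for label in labels]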
def test_model_backup_loading(
    shapes_image_size, shapes_sample_config, shapes_temp_dir
):
    """Test the model checkpoint recovering
    """
    config = read_config(shapes_sample_config)
    label_ids = [x["id"] for x in config["labels"] if x["is_evaluate"]]
    cnn = FeatureDetectionNetwork(
        "test", image_size=shapes_image_size, nb_labels=len(label_ids)
    )
    model = Model(cnn.X, cnn.Y)
    old_weights = model.get_weights()
    checkpoint_path = os.path.join(str(shapes_temp_dir), "checkpoints")
    # Default to 0 so that a missing checkpoint folder fails the assertion
    # below instead of raising a NameError
    trained_model_epoch = 0
    if os.path.isdir(checkpoint_path):
        checkpoints = os.listdir(checkpoint_path)
        if len(checkpoints) > 0:
            # Checkpoint filenames end with a zero-padded epoch number,
            # hence the lexicographic max is the latest checkpoint
            model_checkpoint = max(checkpoints)
            trained_model_epoch = int(model_checkpoint[-5:-3])
            checkpoint_complete_path = os.path.join(
                checkpoint_path, model_checkpoint
            )
            model.load_weights(checkpoint_complete_path)
    new_weights = model.get_weights()
    assert trained_model_epoch > 0
    assert len(old_weights) == len(new_weights)
    assert old_weights[0].shape == new_weights[0].shape
    # Test if old and new weights are different (at least for one layer)
    assert any(
        not np.allclose(lhs, rhs)
        for lhs, rhs in zip(old_weights, new_weights)
    )
def get_data(folders, dataset, model, image_size, batch_size):
    """On the file system, recover `dataset` that can solve `model` problem

    Parameters
    ----------
    folders : dict
        Dictionary of useful folders that indicates paths to data
    dataset : str
        Name of the used dataset (*e.g.* `shapes` or `mapillary`)
    model : str
        Name of the addressed research problem (*e.g.* `feature_detection`
        or `semantic_segmentation`)
    image_size : int
        Size of the images, in pixel (height=width)
    batch_size : int
        Number of images in each batch

    Returns
    -------
    tuple
        Number of labels in the dataset, as well as training and validation
        data generators
    """
    # Data gathering
    if os.path.isfile(folders["training_config"]):
        train_config = utils.read_config(folders["training_config"])
        label_ids = [
            x["id"] for x in train_config["labels"] if x["is_evaluate"]
        ]
        train_generator = generator.create_generator(
            dataset,
            model,
            folders["training"],
            image_size,
            batch_size,
            train_config["labels"],
            seed=SEED,
        )
    else:
        raise FileNotFoundError(
            "There is no training data with the given parameters. Please "
            "generate a valid dataset before calling the training program."
        )
    if os.path.isfile(folders["validation_config"]):
        validation_generator = generator.create_generator(
            dataset,
            model,
            folders["validation"],
            image_size,
            batch_size,
            train_config["labels"],
            seed=SEED,
        )
    else:
        raise FileNotFoundError(
            "There is no validation data with the given parameters. Please "
            "generate a valid dataset before calling the training program."
        )
    nb_labels = len(label_ids)
    return nb_labels, train_generator, validation_generator
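# Usage sketch for `get_data` (illustrative paths and hyperparameters; the
# `folders` dict is assumed to come from `utils.prepare_preprocessed_folder`):
#
#     folders = utils.prepare_preprocessed_folder("./data", "shapes", 64)
#     nb_labels, train_gen, val_gen = get_data(
#         folders, "shapes", "feature_detection", 64, 10
#     )
#     batch_images, batch_labels = next(train_gen)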
def test_model_training(
    shapes_image_size,
    shapes_sample,
    shapes_sample_config,
    shapes_temp_dir,
    shapes_nb_images,
):
    """Test the training of a simple neural network with Keras API, as well
    as model inference and trained model backup

    One big test function to avoid duplicating the training operations (that
    can be long)
    """
    BATCH_SIZE = 10
    NB_EPOCHS = 1
    NB_STEPS = shapes_nb_images // BATCH_SIZE
    config = read_config(shapes_sample_config)
    label_ids = [x["id"] for x in config["labels"] if x["is_evaluate"]]
    gen = create_generator(
        "shapes",
        "featdet",
        shapes_sample,
        shapes_image_size,
        BATCH_SIZE,
        config["labels"],
    )
    cnn = FeatureDetectionNetwork(
        "test", image_size=shapes_image_size, nb_labels=len(label_ids)
    )
    model = Model(cnn.X, cnn.Y)
    model.compile(
        loss="binary_crossentropy", optimizer="adam", metrics=["acc"]
    )
    hist = model.fit_generator(
        gen, epochs=NB_EPOCHS, steps_per_epoch=NB_STEPS
    )
    assert len(hist.history) == 2
    assert all(k in hist.history.keys() for k in ["acc", "loss"])
    assert 0 <= hist.history["acc"][0] <= 1
    test_image = np.random.randint(
        0, 255, [BATCH_SIZE, shapes_image_size, shapes_image_size, 3]
    )
    score = model.predict(test_image)
    assert score.shape == (BATCH_SIZE, len(label_ids))
    assert all(0 <= s <= 1 for s in score.ravel())
    BACKUP_FILENAME = os.path.join(
        str(shapes_temp_dir),
        "checkpoints",
        "test_model_{:02d}.h5".format(NB_EPOCHS),
    )
    model.save(BACKUP_FILENAME)
    assert os.path.isfile(BACKUP_FILENAME)
def main(datadir):
    """Generate a new config.json file with aggregated labels.

    Parameters
    ----------
    datadir : str
        Path of the directory that contains the `config.json` glossary

    Returns
    -------
    dict
        Configuration with aggregated labels
    """
    config = utils.read_config(os.path.join(datadir, 'config.json'))
    df = config_as_dataframe(config)
    agg_config = aggregate_config(config, df)
    return agg_config
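# Usage sketch (illustrative paths; `datadir` must contain a Mapillary-style
# config.json, and the output filename here is a hypothetical choice):
#
#     agg_config = main("./data/mapillary/input")
#     with open("./data/mapillary/input/config_aggregate.json", "w") as fobj:
#         json.dump(agg_config, fobj)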
def test_featdet_mapillary_generator(
    mapillary_image_size, mapillary_sample, mapillary_sample_config,
    nb_channels
):
    """Test the data generator for the Mapillary dataset
    """
    BATCH_SIZE = 10
    config = utils.read_config(mapillary_sample_config)
    label_ids = [x["id"] for x in config["labels"]]
    gen = generator.create_generator(
        "mapillary",
        "feature_detection",
        mapillary_sample,
        mapillary_image_size,
        BATCH_SIZE,
        config["labels"],
    )
    item = next(gen)
    assert len(item) == 2
    im_shape = item[0].shape
    assert im_shape == (
        BATCH_SIZE,
        mapillary_image_size,
        mapillary_image_size,
        nb_channels,
    )
    label_shape = item[1].shape
    assert label_shape == (BATCH_SIZE, len(label_ids))
def test_semseg_shape_generator(
    shapes_image_size, shapes_sample, shapes_sample_config, nb_channels
):
    """Test the data generator for the shape dataset
    """
    BATCH_SIZE = 10
    config = utils.read_config(shapes_sample_config)
    label_ids = [x["id"] for x in config["labels"]]
    gen = generator.create_generator(
        "shapes",
        "semantic_segmentation",
        shapes_sample,
        shapes_image_size,
        BATCH_SIZE,
        config["labels"],
    )
    item = next(gen)
    assert len(item) == 2
    im_shape = item[0].shape
    assert im_shape == (
        BATCH_SIZE,
        shapes_image_size,
        shapes_image_size,
        nb_channels,
    )
    label_shape = item[1].shape
    assert label_shape == (
        BATCH_SIZE,
        shapes_image_size,
        shapes_image_size,
        len(label_ids),
    )
def test_wrong_model_dataset_generator(shapes_sample_config):
    """Test a wrong model and wrong dataset
    """
    dataset = "fake"
    model = "conquer_the_world"
    IMAGE_SIZE = 10
    BATCH_SIZE = 10
    datapath = "./tests/data/" + dataset + "/training"
    config = utils.read_config(shapes_sample_config)
    # wrong dataset name
    with pytest.raises(ValueError) as excinfo:
        generator.create_generator(
            dataset, "feature_detection", datapath, IMAGE_SIZE, BATCH_SIZE,
            config["labels"]
        )
    assert str(excinfo.value) == "Wrong dataset name {}".format(dataset)
    # wrong model name
    with pytest.raises(ValueError) as excinfo:
        generator.create_generator(
            "shapes", model, datapath, IMAGE_SIZE, BATCH_SIZE,
            config["labels"]
        )
    assert str(excinfo.value) == "Wrong model name {}".format(model)
def predict(
    filenames,
    dataset,
    problem,
    datapath="./data",
    name=None,
    network=None,
    batch_size=None,
    dropout=None,
    learning_rate=None,
    learning_rate_decay=None,
    output_dir="/tmp/deeposlandia/predicted",
):
    """Make label prediction on image indicated by `filename`, according to
    considered `problem`

    Parameters
    ----------
    filenames : str
        Name of the image files on the file system
    dataset : str
        Name of the dataset
    problem : str
        Name of the considered model, either `featdet` or `semseg`
    datapath : str
        Relative path of dataset repository
    name : str
        Name of the saved network
    network : str
        Name of the chosen architecture, either `simple`, `vgg` or `inception`
    batch_size : integer
        Batch size used for training the model
    dropout : float
        Dropout rate used for training the model
    learning_rate : float
        Learning rate used for training the model
    learning_rate_decay : float
        Learning rate decay used for training the model
    output_dir : str
        Path of the output directory, where labelled images will be stored
        (useful only if `problem=semseg`)

    Returns
    -------
    dict
        Predictions (floats between 0 and 1 that act as percentages)
        regarding each label
    """
    # `image_paths` is first got as
    # [[image1, ..., image_i], [image_j, ..., image_n]]
    image_paths = [glob.glob(f) for f in filenames]
    # then it is flattened to get a simple list
    flattened_image_paths = sum(image_paths, [])
    images = extract_images(flattened_image_paths)
    model_input_size = images.shape[1]
    instance_args = [
        name,
        model_input_size,
        network,
        batch_size,
        dropout,
        learning_rate,
        learning_rate_decay,
    ]
    instance_name = utils.list_to_str(instance_args, "_")
    prepro_folder = utils.prepare_preprocessed_folder(
        datapath, dataset, model_input_size
    )
    if os.path.isfile(prepro_folder["training_config"]):
        train_config = utils.read_config(prepro_folder["training_config"])
        label_ids = [
            x["id"] for x in train_config["labels"] if x["is_evaluate"]
        ]
        nb_labels = len(label_ids)
    else:
        raise FileNotFoundError(
            "There is no training data with the given parameters. "
            "Please generate a valid dataset before calling the program."
        )
    output_folder = utils.prepare_output_folder(
        datapath, dataset, model_input_size, problem
    )
    # The "best-instance" entry already holds the full path, as shown by the
    # direct use of the "best-model" entry below
    instance_path = output_folder["best-instance"]
    dropout, network = utils.recover_instance(instance_path)
    model = init_model(
        problem,
        instance_name,
        model_input_size,
        nb_labels,
        dropout,
        network,
    )
    if os.path.isfile(output_folder["best-model"]):
        model.load_weights(output_folder["best-model"])
        logger.info(
            "Model weights have been recovered from %s",
            output_folder["best-model"],
        )
    else:
        logger.info(
            "No available trained model for this image size with optimized "
            "hyperparameters. The inference will be done on an untrained "
            "model."
        )
" "The inference will be done on an untrained model") y_raw_pred = model.predict(images, batch_size=2, verbose=1) result = {} if problem == "featdet": label_info = [(i["category"], utils.GetHTMLColor(i["color"])) for i in train_config["labels"]] for filename, prediction in zip(flattened_image_paths, y_raw_pred): result[filename] = [(i[0], 100 * round(float(j), 2), i[1]) for i, j in zip(label_info, prediction)] return result elif problem == "semseg": os.makedirs(output_dir, exist_ok=True) predicted_labels = np.argmax(y_raw_pred, axis=3) encountered_labels = np.unique(predicted_labels) meaningful_labels = [ x for i, x in enumerate(train_config["labels"]) if i in encountered_labels ] labelled_images = np.zeros(shape=np.append(predicted_labels.shape, 3), dtype=np.int8) for i in range(nb_labels): labelled_images[predicted_labels == i] = train_config["labels"][i]["color"] for predicted_labels, filename in zip(labelled_images, flattened_image_paths): predicted_image = Image.fromarray(predicted_labels, "RGB") filename = filename.replace(".jpg", ".png") predicted_image_path = os.path.join(output_dir, os.path.basename(filename)) predicted_image.save(predicted_image_path) result[filename] = os.path.basename(filename) return { "labels": summarize_config(meaningful_labels), "label_images": result, } else: raise ValueError( "Unknown model argument. Please use 'featdet' or 'semseg'.")
def predict(
    filenames,
    dataset,
    problem,
    datapath="./data",
    aggregate=False,
    name=None,
    network=None,
    batch_size=None,
    dropout=None,
    learning_rate=None,
    learning_rate_decay=None,
    output_dir="/tmp/deeposlandia/predicted",
):
    """Make label prediction on image indicated by `filename`, according to
    considered `problem`

    Parameters
    ----------
    filenames : str
        Name of the image files on the file system
    dataset : str
        Name of the dataset
    problem : str
        Name of the considered model, either `feature_detection` or
        `semantic_segmentation`
    datapath : str
        Relative path of dataset repository
    aggregate : bool
        Whether or not the labels are aggregated
    name : str
        Name of the saved network
    network : str
        Name of the chosen architecture, either `simple`, `vgg` or `inception`
    batch_size : integer
        Batch size used for training the model
    dropout : float
        Dropout rate used for training the model
    learning_rate : float
        Learning rate used for training the model
    learning_rate_decay : float
        Learning rate decay used for training the model
    output_dir : str
        Path of the output directory, where labelled images will be stored
        (useful only if `problem=semantic_segmentation`)

    Returns
    -------
    dict
        Predictions (floats between 0 and 1 that act as percentages)
        regarding each label
    """
    # `image_paths` is first got as
    # [[image1, ..., image_i], [image_j, ..., image_n]]
    image_paths = [glob.glob(f) for f in filenames]
    # then it is flattened to get a simple list
    flattened_image_paths = sum(image_paths, [])
    images = extract_images(flattened_image_paths)
    model_input_size = images.shape[1]
    if dataset == "aerial":
        tile_size = utils.get_tile_size_from_image(model_input_size)
    else:
        tile_size = model_input_size
    aggregate_value = "full" if not aggregate else "aggregated"
    instance_args = [
        name,
        tile_size,
        network,
        batch_size,
        aggregate_value,
        dropout,
        learning_rate,
        learning_rate_decay,
    ]
    instance_name = utils.list_to_str(instance_args, "_")
    prepro_folder = utils.prepare_preprocessed_folder(
        datapath, dataset, tile_size, aggregate_value
    )
    if os.path.isfile(prepro_folder["training_config"]):
        train_config = utils.read_config(prepro_folder["training_config"])
        label_ids = [
            x["id"] for x in train_config["labels"] if x["is_evaluate"]
        ]
        nb_labels = len(label_ids)
    else:
        logger.error(
            "There is no training data with the given parameters. Please "
            "generate a valid dataset before calling the program."
        )
        sys.exit(1)
    if any(arg is None for arg in instance_args):
        logger.info("Some arguments are None, the best model is considered.")
        output_folder = utils.prepare_output_folder(
            datapath, dataset, problem
        )
        instance_filename = (
            "best-instance-" + str(tile_size) + "-" + aggregate_value
            + ".json"
        )
        instance_path = os.path.join(output_folder, instance_filename)
        dropout, network = utils.recover_instance(instance_path)
        model = init_model(
            problem,
            instance_name,
            model_input_size,
            nb_labels,
            dropout,
            network,
        )
        checkpoint_filename = (
            "best-model-" + str(tile_size) + "-" + aggregate_value + ".h5"
        )
        checkpoint_full_path = os.path.join(
            output_folder, checkpoint_filename
        )
        if os.path.isfile(checkpoint_full_path):
            logger.info("Checkpoint full path : %s", checkpoint_full_path)
            model.load_weights(checkpoint_full_path)
            logger.info(
                "Model weights have been recovered from %s",
                checkpoint_full_path,
            )
        else:
            logger.info(
                "No available trained model for this image size with "
                "optimized hyperparameters. The inference will be done on "
                "an untrained model."
            )
The " "inference will be done on an untrained model" ) ) else: logger.info("All instance arguments are filled out.") output_folder = utils.prepare_output_folder( datapath, dataset, problem, instance_name ) model = init_model( problem, instance_name, model_input_size, nb_labels, dropout, network, ) checkpoints = [ item for item in os.listdir(output_folder) if "checkpoint-epoch" in item ] if len(checkpoints) > 0: model_checkpoint = max(checkpoints) checkpoint_full_path = os.path.join( output_folder, model_checkpoint ) model.load_weights(checkpoint_full_path) logger.info( "Model weights have been recovered from %s", checkpoint_full_path, ) else: logger.info( ( "No available checkpoint for this configuration. " "The model will be trained from scratch." ) ) y_raw_pred = model.predict(images) result = {} if problem == "feature_detection": label_info = [ (i["category"], utils.GetHTMLColor(i["color"])) for i in train_config["labels"] ] for filename, prediction in zip(flattened_image_paths, y_raw_pred): result[filename] = [ (i[0], 100 * round(float(j), 2), i[1]) for i, j in zip(label_info, prediction) ] return result elif problem == "semantic_segmentation": os.makedirs(output_dir, exist_ok=True) predicted_labels = np.argmax(y_raw_pred, axis=3) encountered_labels = np.unique(predicted_labels) meaningful_labels = [ x for i, x in enumerate(train_config["labels"]) if i in encountered_labels ] labelled_images = np.zeros( shape=np.append(predicted_labels.shape, 3), dtype=np.int8 ) for i in range(nb_labels): labelled_images[predicted_labels == i] = train_config["labels"][i][ "color" ] for predicted_labels, filename in zip( labelled_images, flattened_image_paths ): predicted_image = Image.fromarray(predicted_labels, "RGB") filename = filename.replace(".jpg", ".png") predicted_image_path = os.path.join( output_dir, os.path.basename(filename) ) predicted_image.save(predicted_image_path) result[filename] = os.path.basename(filename) return { "labels": summarize_config(meaningful_labels), "label_images": result, } else: logger.error( ( "Unknown model argument. Please use " "'feature_detection' or 'semantic_segmentation'." ) ) sys.exit(1)
args = parser.parse_args()

# Data path and repository management
aggregate_value = "full" if not args.aggregate_label else "aggregated"
instance_args = [
    args.name,
    args.image_size,
    args.network,
    args.batch_size,
    aggregate_value,
    args.dropout,
    args.learning_rate,
    args.learning_rate_decay,
]
instance_name = utils.list_to_str(instance_args, "_")
prepro_folder = utils.prepare_preprocessed_folder(
    args.datapath, args.dataset, args.image_size, aggregate_value
)

# Data gathering
if (
    os.path.isfile(prepro_folder["training_config"])
    and os.path.isfile(prepro_folder["validation_config"])
    and os.path.isfile(prepro_folder["testing_config"])
):
    train_config = utils.read_config(prepro_folder["training_config"])
    label_ids = [
        x['id'] for x in train_config['labels'] if x['is_evaluate']
    ]
    train_generator = generator.create_generator(
        args.dataset,
        args.model,
        prepro_folder["training"],
        args.image_size,
        args.batch_size,
        label_ids,
        seed=SEED,
    )
    validation_generator = generator.create_generator(
        args.dataset,
        args.model,
        prepro_folder["validation"],
        args.image_size,
        args.batch_size,
        label_ids,
        seed=SEED,
    )
def get_data(folders, dataset, model, image_size, batch_size):
    """On the file system, recover `dataset` that can solve `model` problem

    Parameters
    ----------
    folders : dict
        Dictionary of useful folders that indicates paths to data
    dataset : str
        Name of the used dataset (*e.g.* `shapes` or `mapillary`)
    model : str
        Name of the addressed research problem (*e.g.* `feature_detection`
        or `semantic_segmentation`)
    image_size : int
        Size of the images, in pixel (height=width)
    batch_size : int
        Number of images in each batch

    Returns
    -------
    tuple
        Number of labels in the dataset, as well as training, validation
        and testing data generators
    """
    # Data gathering
    if (
        os.path.isfile(folders["training_config"])
        and os.path.isfile(folders["validation_config"])
        and os.path.isfile(folders["testing_config"])
    ):
        train_config = utils.read_config(folders["training_config"])
        label_ids = [
            x['id'] for x in train_config['labels'] if x['is_evaluate']
        ]
        train_generator = generator.create_generator(
            dataset,
            model,
            folders["training"],
            image_size,
            batch_size,
            train_config["labels"],
            seed=SEED,
        )
        validation_generator = generator.create_generator(
            dataset,
            model,
            folders["validation"],
            image_size,
            batch_size,
            train_config["labels"],
            seed=SEED,
        )
        test_generator = generator.create_generator(
            dataset,
            model,
            folders["testing"],
            image_size,
            batch_size,
            train_config["labels"],
            inference=True,
            seed=SEED,
        )
    else:
        utils.logger.error(
            "There is no valid data with the specified parameters. Please "
            "generate a valid dataset before calling the training program."
        )
        sys.exit(1)
    nb_labels = len(label_ids)
    return nb_labels, train_generator, validation_generator, test_generator
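# Usage sketch (illustrative setup; unlike the two-generator variant above,
# this version also yields an inference-mode testing generator):
#
#     nb_labels, train_gen, val_gen, test_gen = get_data(
#         folders, "mapillary", "semantic_segmentation", 224, 10
#     )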