                                  resize=False,
                                  variable_image_size=True,
                                  verbose=True)

# 3: Set the batch size.
batch_size = 32  # Change the batch size if you like, or if you run into GPU memory issues.

# 4: Set the image transformations for pre-processing and data augmentation options.

# For the training generator:
ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                            img_width=img_width,
                                            background=mean_color)

convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

# 5: Instantiate an encoder that can encode ground truth labels into the format
#    needed by the SSD loss function.

# The encoder constructor needs the spatial dimensions of the model's predictor
# layers to create the anchor boxes.
predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                   model.get_layer('fc7_mbox_conf').output_shape[1:3],
                   model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

ssd_input_encoder = SSDInputEncoder(img_height=img_height,
def __init__(self,
             random_brightness=(-48, 48, 0.5),
             random_contrast=(0.5, 1.8, 0.5),
             random_saturation=(0.5, 1.8, 0.5),
             random_hue=(18, 0.5),
             random_flip=0.5,
             random_translate=((0.03, 0.5), (0.03, 0.5), 0.5),
             random_scale=(0.5, 2.0, 0.5),
             random_gaussian_noise=(0.5, 0., 10),         # Gaussian noise
             random_poisson_noise=(0.5, 60),              # Poisson noise
             random_salt_pepper_noise=(0.5, 0.5, 0.005),  # salt & pepper (impulse) noise
             random_row_defect=(0.5, 1),                  # row defect
             random_col_defect=(0.5, 1),                  # column defect
             n_trials_max=3,
             clip_boxes=True,
             overlap_criterion='area',
             bounds_box_filter=(0.3, 1.0),
             bounds_validator=(0.5, 1.0),
             n_boxes_min=1,
             background=(0, 0, 0),
             labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):

    if (random_scale[0] >= 1) or (random_scale[1] <= 1):
        raise ValueError("This sequence of transformations only makes sense if the minimum "
                         "scaling factor is <1 and the maximum scaling factor is >1.")

    self.n_trials_max = n_trials_max
    self.clip_boxes = clip_boxes
    self.overlap_criterion = overlap_criterion
    self.bounds_box_filter = bounds_box_filter
    self.bounds_validator = bounds_validator
    self.n_boxes_min = n_boxes_min
    self.background = background
    self.labels_format = labels_format

    # Determines which boxes are kept in an image after the transformations have been applied.
    self.box_filter = BoxFilter(check_overlap=True,
                                check_min_area=True,
                                check_degenerate=True,
                                overlap_criterion=self.overlap_criterion,
                                overlap_bounds=self.bounds_box_filter,
                                min_area=16,
                                labels_format=self.labels_format)

    # Determines whether the result of the transformations is a valid training image.
    self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion,
                                          bounds=self.bounds_validator,
                                          n_boxes_min=self.n_boxes_min,
                                          labels_format=self.labels_format)

    # Utility transformations
    self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV')
    self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB')
    self.convert_to_float32 = ConvertDataType(to='float32')
    self.convert_to_uint8 = ConvertDataType(to='uint8')
    self.convert_to_3_channels = ConvertTo3Channels()  # Make sure all images end up having 3 channels.
    self.convert_to_1_channel = ConvertTo1Channel()  # Make sure all images end up having 1 channel.
    # Photometric transformations
    self.random_brightness = RandomBrightness(lower=random_brightness[0],
                                              upper=random_brightness[1],
                                              prob=random_brightness[2])
    self.random_contrast = RandomContrast(lower=random_contrast[0],
                                          upper=random_contrast[1],
                                          prob=random_contrast[2])
    self.random_saturation = RandomSaturation(lower=random_saturation[0],
                                              upper=random_saturation[1],
                                              prob=random_saturation[2])
    self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1])

    # Geometric transformations
    self.random_flip = RandomFlip(dim='horizontal',
                                  prob=random_flip,
                                  labels_format=self.labels_format)
    self.random_translate = RandomTranslate(dy_minmax=random_translate[0],
                                            dx_minmax=random_translate[1],
                                            prob=random_translate[2],
                                            clip_boxes=self.clip_boxes,
                                            box_filter=self.box_filter,
                                            image_validator=self.image_validator,
                                            n_trials_max=self.n_trials_max,
                                            background=self.background,
                                            labels_format=self.labels_format)
    self.random_zoom_in = RandomScale(min_factor=1.0,
                                      max_factor=random_scale[1],
                                      prob=random_scale[2],
                                      clip_boxes=self.clip_boxes,
                                      box_filter=self.box_filter,
                                      image_validator=self.image_validator,
                                      n_trials_max=self.n_trials_max,
                                      background=self.background,
                                      labels_format=self.labels_format)
    self.random_zoom_out = RandomScale(min_factor=random_scale[0],
                                       max_factor=1.0,
                                       prob=random_scale[2],
                                       clip_boxes=self.clip_boxes,
                                       box_filter=self.box_filter,
                                       image_validator=self.image_validator,
                                       n_trials_max=self.n_trials_max,
                                       background=self.background,
                                       labels_format=self.labels_format)

    # Noise and sensor-defect transformations.
    # (The 'thikness' keyword spelling matches the signatures of the
    # RandomRowDefect/RandomColDefect transforms. Note that the row defect is
    # instantiated but not included in the sequences below.)
    self.random_RowDefect = RandomRowDefect(prob=random_row_defect[0],
                                            thikness=random_row_defect[1])
    self.random_col_defect = RandomColDefect(prob=random_col_defect[0],
                                             thikness=random_col_defect[1])
    self.random_salt_pepper = RandomSaltPepperNoise(prob=random_salt_pepper_noise[0],
                                                    salt_vs_pepper_ratio=random_salt_pepper_noise[1],
                                                    percentage=random_salt_pepper_noise[2])
    self.random_poisson = RandomPoissonNoise(prob=random_poisson_noise[0],
                                             Lambda=random_poisson_noise[1])
    self.random_gaussian = RandomGaussianNoise(prob=random_gaussian_noise[0],
                                               mean=random_gaussian_noise[1],
                                               sigma=random_gaussian_noise[2])

    # If we zoom in, do translation before scaling.
    self.sequence1 = [self.convert_to_1_channel,
                      self.convert_to_float32,
                      self.random_brightness,
                      self.random_contrast,
                      # self.convert_to_uint8,
                      # self.convert_RGB_to_HSV,
                      # self.convert_to_float32,
                      # self.random_saturation,
                      # self.random_hue,
                      self.convert_to_uint8,
                      # self.convert_HSV_to_RGB,
                      self.random_translate,
                      self.random_zoom_in,
                      self.random_flip,
                      self.random_salt_pepper,
                      self.random_poisson,
                      self.random_gaussian,
                      self.random_col_defect,
                      self.convert_to_1_channel]

    # If we zoom out, do scaling before translation.
    self.sequence2 = [self.convert_to_1_channel,
                      self.convert_to_float32,
                      self.random_brightness,
                      # self.convert_to_uint8,
                      # self.convert_RGB_to_HSV,
                      # self.convert_to_float32,
                      # self.random_saturation,
                      # self.random_hue,
                      # self.convert_to_uint8,
                      # self.convert_HSV_to_RGB,
                      self.convert_to_float32,
                      self.random_contrast,
                      self.convert_to_uint8,
                      self.random_zoom_out,
                      self.random_translate,
                      self.random_flip,
                      self.random_salt_pepper,
                      self.random_poisson,
                      self.random_gaussian,
                      self.random_col_defect,
                      self.convert_to_1_channel]
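
# A sketch of how the two sequences are typically dispatched at call time.
# Hedged: the real `__call__` of this class lies outside this excerpt; the
# pattern below mirrors ssd_keras's DataAugmentationConstantInputSize and is
# an assumption, not the author's exact code.
#
# def __call__(self, image, labels=None):
#     # Choose the zoom-in sequence (translate first) or the zoom-out
#     # sequence (scale first) at random, then apply it step by step.
#     sequence = self.sequence1 if np.random.choice(2) else self.sequence2
#     for transform in sequence:
#         image, labels = transform(image, labels)
#     return image, labels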
def __init__(self,
             resize_height,
             resize_width,
             random_brightness=(-48, 48, 0.5),
             random_contrast=(0.5, 1.8, 0.5),
             random_saturation=(0.5, 1.8, 0.5),
             random_hue=(18, 0.5),
             random_flip=0.5,
             random_rotate=([90, 180, 270], 0.5),
             min_scale=0.3,
             max_scale=2.0,
             min_aspect_ratio=0.8,
             max_aspect_ratio=1.25,
             n_trials_max=3,
             clip_boxes=True,
             overlap_criterion='area',
             bounds_box_filter=(0.3, 1.0),
             bounds_validator=(0.5, 1.0),
             n_boxes_min=1,
             background=(0, 0, 0),
             labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):

    self.n_trials_max = n_trials_max
    self.clip_boxes = clip_boxes
    self.overlap_criterion = overlap_criterion
    self.bounds_box_filter = bounds_box_filter
    self.bounds_validator = bounds_validator
    self.n_boxes_min = n_boxes_min
    self.background = background
    self.labels_format = labels_format

    # Determines which boxes are kept in an image after the transformations have been applied.
    self.box_filter_patch = BoxFilter(check_overlap=True,
                                      check_min_area=False,
                                      check_degenerate=False,
                                      overlap_criterion=self.overlap_criterion,
                                      overlap_bounds=self.bounds_box_filter,
                                      labels_format=self.labels_format)
    self.box_filter_resize = BoxFilter(check_overlap=False,
                                       check_min_area=True,
                                       check_degenerate=True,
                                       min_area=16,
                                       labels_format=self.labels_format)

    # Determines whether the result of the transformations is a valid training image.
    self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion,
                                          bounds=self.bounds_validator,
                                          n_boxes_min=self.n_boxes_min,
                                          labels_format=self.labels_format)

    # Utility transformations
    self.convert_to_3_channels = ConvertTo3Channels()  # Make sure all images end up having 3 channels.
    self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV')
    self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB')
    self.convert_to_float32 = ConvertDataType(to='float32')
    self.convert_to_uint8 = ConvertDataType(to='uint8')
    self.resize = Resize(height=resize_height,
                         width=resize_width,
                         box_filter=self.box_filter_resize,
                         labels_format=self.labels_format)

    # Photometric transformations
    self.random_brightness = RandomBrightness(lower=random_brightness[0],
                                              upper=random_brightness[1],
                                              prob=random_brightness[2])
    self.random_contrast = RandomContrast(lower=random_contrast[0],
                                          upper=random_contrast[1],
                                          prob=random_contrast[2])
    self.random_saturation = RandomSaturation(lower=random_saturation[0],
                                              upper=random_saturation[1],
                                              prob=random_saturation[2])
    self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1])

    # Geometric transformations
    self.random_horizontal_flip = RandomFlip(dim='horizontal',
                                             prob=random_flip,
                                             labels_format=self.labels_format)
    self.random_vertical_flip = RandomFlip(dim='vertical',
                                           prob=random_flip,
                                           labels_format=self.labels_format)
    self.random_rotate = RandomRotate(angles=random_rotate[0],
                                      prob=random_rotate[1],
                                      labels_format=self.labels_format)
    self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar',
                                                          min_scale=min_scale,
                                                          max_scale=max_scale,
                                                          scale_uniformly=False,
                                                          min_aspect_ratio=min_aspect_ratio,
                                                          max_aspect_ratio=max_aspect_ratio)
    self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator,
                                    box_filter=self.box_filter_patch,
                                    image_validator=self.image_validator,
                                    n_trials_max=self.n_trials_max,
                                    clip_boxes=self.clip_boxes,
                                    prob=1.0,
                                    can_fail=False,
                                    labels_format=self.labels_format)

    # Define the processing chain.
    self.transformations = [self.convert_to_3_channels,
                            self.convert_to_float32,
                            self.random_brightness,
                            self.random_contrast,
                            self.convert_to_uint8,
                            self.convert_RGB_to_HSV,
                            self.convert_to_float32,
                            self.random_saturation,
                            self.random_hue,
                            self.convert_to_uint8,
                            self.convert_HSV_to_RGB,
                            self.random_horizontal_flip,
                            self.random_vertical_flip,
                            self.random_rotate,
                            self.random_patch,
                            self.resize]
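
# Example usage (an illustrative sketch, not the author's code: the class name
# `DataAugmentationVariableInputSize` and the idea that the chain is applied by
# iterating over `self.transformations` are assumptions based on the
# constructor above and on ssd_keras conventions):
#
# augmentation = DataAugmentationVariableInputSize(resize_height=300, resize_width=300)
# for transform in augmentation.transformations:
#     image, labels = transform(image, labels)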
def predict_on_dataset(self,
                       img_height,
                       img_width,
                       batch_size,
                       data_generator_mode='resize',
                       decoding_confidence_thresh=0.01,
                       decoding_iou_threshold=0.45,
                       decoding_top_k=200,
                       decoding_pred_coords='centroids',
                       decoding_normalize_coords=True,
                       decoding_border_pixels='include',
                       round_confidences=False,
                       verbose=True,
                       ret=False):
    '''
    Runs predictions for the given model over the entire dataset given by `data_generator`.

    Arguments:
        img_height (int): The input image height for the model.
        img_width (int): The input image width for the model.
        batch_size (int): The batch size for the evaluation.
        data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images
            will be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the
            aspect ratios of the images. If 'pad', the input images will be first padded so that they
            have the aspect ratio defined by `img_height` and `img_width` and then resized to
            `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
        decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
            A float in [0,1), the minimum classification confidence in a specific positive class in order
            to be considered for the non-maximum suppression stage for the respective class. A lower value
            will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in
            the confidence thresholding stage.
        decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode.
            A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` with
            a locally maximal box will be removed from the set of predictions for a given class, where
            'maximal' refers to the box score.
        decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of
            highest scoring predictions to be kept for each batch item after the non-maximum suppression stage.
        decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box
            coordinate format that the model outputs. Can be either 'centroids' for the format
            `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode.
            Set to `True` if the model outputs relative coordinates. Do not set this to `True` if the
            model already outputs absolute coordinates, as that would result in incorrect coordinates.
        decoding_border_pixels (str, optional): Only relevant if the model is in 'training' mode. How to
            treat the border pixels of bounding boxes during decoding; one of 'include', 'exclude', or 'half'.
        round_confidences (int, optional): `False` or an integer that is the number of decimals that the
            prediction confidences will be rounded to. If `False`, the confidences will not be rounded.
        verbose (bool, optional): If `True`, will print out the progress during runtime.
        ret (bool, optional): If `True`, returns the predictions.

    Returns:
        None by default. Optionally, a nested list containing the predictions for each class.
    '''

    class_id_pred = self.pred_format['class_id']
    conf_pred = self.pred_format['conf']
    xmin_pred = self.pred_format['xmin']
    ymin_pred = self.pred_format['ymin']
    xmax_pred = self.pred_format['xmax']
    ymax_pred = self.pred_format['ymax']

    #############################################################################################
    # Configure the data generator for the evaluation.
    #############################################################################################

    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width, labels_format=self.gt_format)
    if data_generator_mode == 'resize':
        transformations = [convert_to_3_channels, resize]
    elif data_generator_mode == 'pad':
        random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width / img_height,
                                      labels_format=self.gt_format)
        transformations = [convert_to_3_channels, random_pad, resize]
    else:
        raise ValueError("`data_generator_mode` can be either of 'resize' or 'pad', "
                         "but received '{}'.".format(data_generator_mode))

    # Set the generator parameters.
    generator = self.data_generator.generate(batch_size=batch_size,
                                             shuffle=False,
                                             transformations=transformations,
                                             label_encoder=None,
                                             returns={'processed_images',
                                                      'image_ids',
                                                      'evaluation-neutral',
                                                      'inverse_transform',
                                                      'original_labels'},
                                             keep_images_without_gt=True,
                                             degenerate_box_handling='remove')

    if self.data_generator.image_ids is None:
        self.data_generator.image_ids = list(range(self.data_generator.get_dataset_size()))

    #############################################################################################
    # Predict over all batches of the dataset and store the predictions.
    #############################################################################################

    # We have to generate a separate results list for each class.
    results = [list() for _ in range(self.n_classes + 1)]

    # Create a dictionary that maps image IDs to ground truth annotations.
    # We'll need it below.
    image_ids_to_labels = {}

    # Compute the number of batches to iterate over the entire dataset.
    n_images = self.data_generator.get_dataset_size()
    n_batches = int(ceil(n_images / batch_size))
    if verbose:
        print("Number of images in the evaluation dataset: {}".format(n_images))
        print()
        tr = trange(n_batches, file=sys.stdout)
        tr.set_description('Producing predictions batch-wise')
    else:
        tr = range(n_batches)

    for j in tr:
        # Generate batch.
        batch_X, batch_image_ids, batch_eval_neutral, batch_inverse_transforms, batch_orig_labels = next(generator)
        # Predict.
        y_pred = self.model.predict(batch_X)
        # If the model was created in 'training' mode, the raw predictions need to
        # be decoded and filtered, otherwise that's already taken care of.
        if self.model_mode == 'training':
            # Decode.
            y_pred = decode_detections(y_pred,
                                       confidence_thresh=decoding_confidence_thresh,
                                       iou_threshold=decoding_iou_threshold,
                                       top_k=decoding_top_k,
                                       input_coords=decoding_pred_coords,
                                       normalize_coords=decoding_normalize_coords,
                                       img_height=img_height,
                                       img_width=img_width,
                                       border_pixels=decoding_border_pixels)
        else:
            # Filter out the all-zeros dummy elements of `y_pred`.
            y_pred_filtered = []
            for i in range(len(y_pred)):
                y_pred_filtered.append(y_pred[i][y_pred[i, :, 0] != 0])
            y_pred = y_pred_filtered
        # Convert the predicted box coordinates for the original images.
        y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)

        # Iterate over all batch items.
        for k, batch_item in enumerate(y_pred):
            image_id = batch_image_ids[k]
            path = '/data/deeplearn/SWEIPENet/dataset/Detections/detection' + str(self.modelindex)
            # `os.makedirs` with `exist_ok=True` also creates missing parent
            # directories, which plain `os.mkdir` would fail on.
            os.makedirs(path, exist_ok=True)
            txtpath = os.path.join(path, image_id + '.txt')
            # Opening the file in 'w' mode creates it if necessary, so the
            # former `os.mknod` call was redundant (and unportable).
            file_fid = open(txtpath, 'w')

            for box in batch_item:
                class_id = int(box[class_id_pred])
                # Round the box coordinates to reduce the required memory.
                if round_confidences:
                    confidence = round(box[conf_pred], round_confidences)
                else:
                    confidence = box[conf_pred]
                xmin = round(box[xmin_pred], 1)
                ymin = round(box[ymin_pred], 1)
                xmax = round(box[xmax_pred], 1)
                ymax = round(box[ymax_pred], 1)
                prediction = (image_id, confidence, xmin, ymin, xmax, ymax)
                # Write the detections for each image into Detections/<image_id>.txt.
                # Using a mapping avoids a NameError for any class ID outside {1, 2, 3}.
                class_names = {1: 'seacucumber', 2: 'seaurchin', 3: 'scallop'}
                class_name = class_names.get(class_id, 'class_{}'.format(class_id))
                boxstr = class_name + ' ' + str(confidence) + ' ' + str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) + ' ' + str(ymax)
                file_fid.write(boxstr + '\n')
                results[class_id].append(prediction)
            file_fid.close()

    self.prediction_results = results

    if ret:
        return results
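
# Example usage (an illustrative sketch: `Evaluator` stands in for whatever
# class defines `predict_on_dataset` above, and its constructor arguments are
# assumptions based on the attributes the method reads -- `self.model`,
# `self.data_generator`, `self.model_mode`, `self.n_classes`):
#
# evaluator = Evaluator(model=model, n_classes=3, data_generator=val_dataset,
#                       model_mode='training')
# results = evaluator.predict_on_dataset(img_height=512, img_width=512,
#                                        batch_size=8, ret=True)
# # results[c] holds the (image_id, confidence, xmin, ymin, xmax, ymax)
# # tuples predicted for class c.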
def predict_all_to_json(out_file,
                        model,
                        img_height,
                        img_width,
                        classes_to_cats,
                        data_generator,
                        batch_size,
                        data_generator_mode='resize',
                        model_mode='training',
                        confidence_thresh=0.01,
                        iou_threshold=0.45,
                        top_k=200,
                        pred_coords='centroids',
                        normalize_coords=True):
    '''
    Runs detection predictions over the whole dataset given a model and saves them
    in a JSON file in the MS COCO detection results format.

    Arguments:
        out_file (str): The file name (full path) under which to save the results JSON file.
        model (Keras model): A Keras SSD model object.
        img_height (int): The input image height for the model.
        img_width (int): The input image width for the model.
        classes_to_cats (dict): A dictionary that maps the consecutive class IDs predicted by the model
            to the non-consecutive original MS COCO category IDs.
        data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
        batch_size (int): The batch size for the evaluation.
        data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images
            will be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the
            aspect ratios of the images. If 'pad', the input images will be first padded so that they
            have the aspect ratio defined by `img_height` and `img_width` and then resized to
            `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
        model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference'
            or 'inference_fast'. This is needed in order to know whether the model output is already decoded
            or still needs to be decoded. Refer to the model documentation for the meaning of the individual modes.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a
            specific positive class in order to be considered for the non-maximum suppression stage for the
            respective class. A lower value will result in a larger part of the selection process being done
            by the non-maximum suppression stage, while a larger value will result in a larger part of the
            selection process happening in the confidence thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater
            than `iou_threshold` with a locally maximal box will be removed from the set of predictions for
            a given class, where 'maximal' refers to the box score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item
            after the non-maximum suppression stage. Defaults to 200, following the paper.
        pred_coords (str, optional): The box coordinate format that the model outputs. Can be either
            'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
            'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format
            `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates
            (i.e. coordinates in [0,1]) and you wish to transform these relative coordinates back to
            absolute coordinates. If the model outputs relative coordinates, but you do not want to
            convert them back to absolute coordinates, set this to `False`. Do not set this to `True`
            if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.

    Returns:
        None.
    '''

    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width)
    if data_generator_mode == 'resize':
        transformations = [convert_to_3_channels, resize]
    elif data_generator_mode == 'pad':
        random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width / img_height, clip_boxes=False)
        transformations = [convert_to_3_channels, random_pad, resize]
    else:
        raise ValueError("Unexpected argument value: `data_generator_mode` can be either of "
                         "'resize' or 'pad', but received '{}'.".format(data_generator_mode))

    # Set the generator parameters.
    generator = data_generator.generate(batch_size=batch_size,
                                        shuffle=False,
                                        transformations=transformations,
                                        label_encoder=None,
                                        returns={'processed_images', 'image_ids', 'inverse_transform'},
                                        keep_images_without_gt=True)

    # Put the results in this list.
    results = []

    # Compute the number of batches to iterate over the entire dataset.
    n_images = data_generator.get_dataset_size()
    print("Number of images in the evaluation dataset: {}".format(n_images))
    n_batches = int(ceil(n_images / batch_size))

    # Loop over all batches.
    tr = trange(n_batches, file=sys.stdout)
    tr.set_description('Producing results file')
    for i in tr:
        # Generate batch.
        batch_X, batch_image_ids, batch_inverse_transforms = next(generator)
        # Predict.
        y_pred = model.predict(batch_X)
        # If the model was created in 'training' mode, the raw predictions need to
        # be decoded and filtered, otherwise that's already taken care of.
        if model_mode == 'training':
            # Decode.
            y_pred = decode_detections(y_pred,
                                       confidence_thresh=confidence_thresh,
                                       iou_threshold=iou_threshold,
                                       top_k=top_k,
                                       input_coords=pred_coords,
                                       normalize_coords=normalize_coords,
                                       img_height=img_height,
                                       img_width=img_width)
        else:
            # Filter out the all-zeros dummy elements of `y_pred`.
            # (The loop variable is `j` so as not to shadow the batch index `i`.)
            y_pred_filtered = []
            for j in range(len(y_pred)):
                y_pred_filtered.append(y_pred[j][y_pred[j, :, 0] != 0])
            y_pred = y_pred_filtered
        # Convert the predicted box coordinates for the original images.
        y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)

        # Convert each predicted box into the results format.
        for k, batch_item in enumerate(y_pred):
            for box in batch_item:
                class_id = int(box[0])
                # Transform the consecutive class IDs back to the original COCO category IDs.
                cat_id = classes_to_cats[class_id]
                # Round the box coordinates to reduce the JSON file size.
                xmin = float(round(box[2], 1))
                ymin = float(round(box[3], 1))
                xmax = float(round(box[4], 1))
                ymax = float(round(box[5], 1))
                width = xmax - xmin
                height = ymax - ymin
                bbox = [xmin, ymin, width, height]
                result = {}
                result['image_id'] = batch_image_ids[k]
                result['category_id'] = cat_id
                result['score'] = float(round(box[1], 3))
                result['bbox'] = bbox
                results.append(result)

    with open(out_file, 'w') as f:
        json.dump(results, f)

    print("Prediction results saved in '{}'".format(out_file))
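
# Example usage (an illustrative sketch; the category mapping, file name, and
# `val_dataset` below are assumptions, not values from this module):
#
# classes_to_cats = {1: 1, 2: 2, 3: 3}  # model class ID -> original COCO category ID
# predict_all_to_json(out_file='detections_val.json',
#                     model=model,
#                     img_height=300, img_width=300,
#                     classes_to_cats=classes_to_cats,
#                     data_generator=val_dataset,
#                     batch_size=8,
#                     model_mode='training')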
def run(train_dir, valid_dir, set_dir, model_dir):
    # train_dir = arguments.train_dir
    # valid_dir = arguments.valid_dir

    # Set the training and validation dataset paths.
    train_dataset_dir = train_dir
    train_annot_dir = train_dir + '/annot/'
    train_set = train_dir + '/img_set.txt'
    valid_dataset_dir = valid_dir
    valid_annot_dir = valid_dir + '/annot/'
    valid_set = valid_dir + '/valid_set.txt'

    batch_size = 16
    print('Using batch size of: {}'.format(batch_size))

    # model_path = 'COCO_512.h5'
    model_path = model_dir
    # model_path = 'saved_model.h5'

    # Needs to know the classes and their order to map them to integers.
    classes = ['background', 'car', 'bus', 'truck']

    # Set the required parameters for training of SSD.
    img_height = 512
    img_width = 512
    img_channels = 3  # Colour image
    mean_color = [123, 117, 104]  # DO NOT CHANGE
    swap_channels = [2, 1, 0]  # Original SSD used BGR
    n_classes = 3  # 80 for COCO
    scales_coco = [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9, 1.06]
    scales = scales_coco
    aspect_ratios = [[1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5]]
    two_boxes_for_ar1 = True
    steps = [8, 16, 32, 64, 128, 256, 512]
    offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    clip_boxes = False
    variances = [0.1, 0.1, 0.2, 0.2]
    normalize_coords = True

    K.clear_session()

    model = ssd_512(image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    mode='training',
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    clip_boxes=clip_boxes,
                    variances=variances,
                    normalize_coords=normalize_coords,
                    subtract_mean=mean_color,
                    swap_channels=swap_channels)

    model.load_weights(model_path, by_name=True)

    sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=False)
    ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
    model.compile(optimizer=sgd, loss=ssd_loss.compute_loss)
    # model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,
    #                                                'L2Normalization': L2Normalization,
    #                                                'compute_loss': ssd_loss.compute_loss})

    # Create data generators for the train and valid sets.
    train_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)
    valid_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)

    train_dataset.parse_xml(images_dirs=[train_dataset_dir],
                            image_set_filenames=[train_set],
                            annotations_dirs=[train_annot_dir],
                            classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False)
    valid_dataset.parse_xml(images_dirs=[valid_dataset_dir],
                            image_set_filenames=[valid_set],
                            annotations_dirs=[valid_annot_dir],
                            classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False)

    # Will speed up training but requires more memory.
    # Can comment out to avoid the memory requirements.
    '''
    train_dataset.create_hdf5_dataset(file_path='dataset_pascal_voc_07+12_trainval.h5',
                                      resize=False,
                                      variable_image_size=True,
                                      verbose=True)
    valid_dataset.create_hdf5_dataset(file_path='dataset_pascal_voc_07_test.h5',
                                      resize=False,
                                      variable_image_size=True,
                                      verbose=True)
    '''

    ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                                img_width=img_width,
                                                background=mean_color)
    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width)

    predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                       model.get_layer('fc7_mbox_conf').output_shape[1:3],
                       model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv9_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv10_2_mbox_conf').output_shape[1:3]]

    ssd_input_encoder = SSDInputEncoder(img_height=img_height,
                                        img_width=img_width,
                                        n_classes=n_classes,
                                        predictor_sizes=predictor_sizes,
                                        scales=scales,
                                        aspect_ratios_per_layer=aspect_ratios,
                                        two_boxes_for_ar1=two_boxes_for_ar1,
                                        steps=steps,
                                        offsets=offsets,
                                        clip_boxes=clip_boxes,
                                        variances=variances,
                                        matching_type='multi',
                                        pos_iou_threshold=0.5,
                                        neg_iou_limit=0.5,
                                        normalize_coords=normalize_coords)

    train_generator = train_dataset.generate(batch_size=batch_size,
                                             shuffle=True,
                                             transformations=[ssd_data_augmentation],
                                             label_encoder=ssd_input_encoder,
                                             returns={'processed_images', 'encoded_labels'},
                                             keep_images_without_gt=False)
    val_generator = valid_dataset.generate(batch_size=batch_size,
                                           shuffle=False,
                                           transformations=[convert_to_3_channels, resize],
                                           label_encoder=ssd_input_encoder,
                                           returns={'processed_images', 'encoded_labels'},
                                           keep_images_without_gt=False)

    # Get the number of samples in the training and validation datasets.
    train_dataset_size = train_dataset.get_dataset_size()
    valid_dataset_size = valid_dataset.get_dataset_size()
    print("Number of images in the training dataset:\t{:>6}".format(train_dataset_size))
    print("Number of images in the validation dataset:\t{:>6}".format(valid_dataset_size))

    model_checkpoint = ModelCheckpoint(filepath='ssd_epoch-{epoch:02d}_loss-{loss:.4f}_val_loss-{val_loss:.4f}.h5',
                                       monitor='val_loss',
                                       verbose=1,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto',
                                       period=1)
    # csv_logger = CSVLogger(filename='ssd512_training_log.csv',
    #                        separator=',',
    #                        append=True)
    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule, verbose=1)
    terminate_on_nan = TerminateOnNaN()

    # `csv_logger` is commented out above, so it must not appear in the
    # callbacks list (it would raise a NameError).
    callbacks = [model_checkpoint,
                 learning_rate_scheduler,
                 terminate_on_nan]
    # callbacks = [learning_rate_scheduler,
    #              terminate_on_nan]

    initial_epoch = 0
    final_epoch = 150  # 150
    # ceil(num_samples / batch_size); use the actual training set size rather
    # than a hard-coded sample count.
    steps_per_epoch = math.ceil(train_dataset_size / batch_size)

    # Training
    history = model.fit_generator(generator=train_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=final_epoch,
                                  callbacks=callbacks,
                                  validation_data=val_generator,
                                  validation_steps=math.ceil(valid_dataset_size / batch_size),
                                  initial_epoch=initial_epoch)

    # Save the final trained model.
    model.save('trained.h5')

    # Make predictions.
    predict_generator = valid_dataset.generate(batch_size=1,
                                               shuffle=True,
                                               transformations=[convert_to_3_channels, resize],
                                               label_encoder=None,
                                               returns={'processed_images',
                                                        'filenames',
                                                        'inverse_transform',
                                                        'original_images',
                                                        'original_labels'},
                                               keep_images_without_gt=False)

    batch_images, batch_filenames, batch_inverse_transforms, batch_original_images, batch_original_labels = next(predict_generator)

    i = 0  # Which batch item to look at

    print("Image:", batch_filenames[i])
    print()
    print("Ground truth boxes:\n")
    print(np.array(batch_original_labels[i]))

    y_pred = model.predict(batch_images)
    y_pred_decoded = decode_detections(y_pred,
                                       confidence_thresh=0.2,
                                       iou_threshold=0.4,
                                       top_k=200,
                                       normalize_coords=normalize_coords,
                                       img_height=img_height,
                                       img_width=img_width)
    y_pred_decoded_inv = apply_inverse_transforms(y_pred_decoded, batch_inverse_transforms)

    np.set_printoptions(precision=2, suppress=True, linewidth=90)
    print("Predicted boxes:\n")
    print('   class   conf xmin   ymin   xmax   ymax')
    print(y_pred_decoded_inv[i])

    # Set the colors for the bounding boxes.
    colors = plt.cm.hsv(np.linspace(0, 1, n_classes + 1)).tolist()
    # classes = ['background', 'car', 'bus', 'truck', 'motorbike']  # Already set at start

    plt.figure(figsize=(20, 12))
    plt.imshow(batch_original_images[i])
    current_axis = plt.gca()

    for box in batch_original_labels[i]:
        xmin = box[1]
        ymin = box[2]
        xmax = box[3]
        ymax = box[4]
        label = '{}'.format(classes[int(box[0])])
        current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                             color='green', fill=False, linewidth=2))
        current_axis.text(xmin, ymin, label, size='x-large', color='white',
                          bbox={'facecolor': 'green', 'alpha': 1.0})

    for box in y_pred_decoded_inv[i]:
        xmin = box[2]
        ymin = box[3]
        xmax = box[4]
        ymax = box[5]
        color = colors[int(box[0])]
        label = '{}: {:.2f}'.format(classes[int(box[0])], box[1])
        current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                             color=color, fill=False, linewidth=2))
        current_axis.text(xmin, ymin, label, size='x-large', color='white',
                          bbox={'facecolor': color, 'alpha': 1.0})

    plt.show()

    return
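
# Example invocation (an illustrative sketch; the directory layout below is an
# assumption derived from how `run` builds its paths above):
#
# run(train_dir='data/train',   # expects data/train/annot/ and data/train/img_set.txt
#     valid_dir='data/valid',   # expects data/valid/annot/ and data/valid/valid_set.txt
#     set_dir='data',           # note: unused inside `run` as written
#     model_dir='COCO_512.h5')  # path to the pre-trained weights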
def predicting(images,
               image_path,
               labels_output_format=('class_id', 'xmin', 'ymin', 'xmax', 'ymax')):
    # NB: this function relies on module-level globals that must be initialized
    # elsewhere: `model`, `graph`, `img_height`, `img_width`, and `normalize_coords`.
    labels_format = {'class_id': labels_output_format.index('class_id'),
                     'xmin': labels_output_format.index('xmin'),
                     'ymin': labels_output_format.index('ymin'),
                     'xmax': labels_output_format.index('xmax'),
                     'ymax': labels_output_format.index('ymax')}

    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width)

    generate_pre = Data_Generator(image=images, image_path=image_path, labels_format=labels_format)
    predict_generator = generate_pre.generate(batch_size=1,
                                              transformations=[convert_to_3_channels, resize],
                                              label_encoder=None,
                                              returns={'processed_images',
                                                       'filenames',
                                                       'inverse_transform',
                                                       'original_images',
                                                       'original_labels'},
                                              keep_images_without_gt=False)

    batch_images, batch_filenames, batch_inverse_transforms, batch_original_images, batch_original_labels = next(predict_generator)

    i = 0
    print("Image:", "????")

    global graph
    with graph.as_default():
        y_pred = model.predict(batch_images)

    y_pred_decoded = decode_detections(y_pred,
                                       confidence_thresh=0.25,
                                       iou_threshold=0.4,
                                       top_k=200,
                                       normalize_coords=normalize_coords,
                                       img_height=img_height,
                                       img_width=img_width)
    print(y_pred_decoded)
    y_pred_decoded_inv = apply_inverse_transforms(y_pred_decoded, batch_inverse_transforms)
    # print(y_pred_decoded_inv)
    np.set_printoptions(precision=2, suppress=True, linewidth=90)
    # print("Predicted boxes:\n")
    # print('   class   conf xmin   ymin   xmax   ymax')
    # print(y_pred_decoded_inv[i])

    # Set the colors for the bounding boxes.
    # plt.figure(figsize=(20, 12))
    # plt.imshow(batch_original_images[i])
    # colors = plt.cm.hsv(np.linspace(0, 1, n_classes + 1)).tolist()
    # current_axis = plt.gca()
    #
    # for box in y_pred_decoded_inv[i]:
    #     xmin = box[2]
    #     ymin = box[3]
    #     xmax = box[4]
    #     ymax = box[5]
    #     color = colors[int(box[0])]
    #     label = '{}: {:.2f}'.format(classes[int(box[0])], box[1])
    #     current_axis.add_patch(
    #         plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, color=color, fill=False, linewidth=2))
    #     current_axis.text(xmin, ymin, label, size='x-large', color='white',
    #                       bbox={'facecolor': color, 'alpha': 1.0})
    #
    # plt.show()
    # plt.xticks([])
    # plt.yticks([])
    # plt.savefig(self.image_path)
    # K.clear_session()
    # gc.collect()

    return y_pred_decoded_inv
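
# Example usage (an illustrative sketch; the image loading below is an
# assumption, and the module-level globals noted above must already be set up):
#
# import numpy as np
# from PIL import Image
# image = np.array(Image.open('sample.jpg'))
# detections = predicting(images=[image], image_path='sample.jpg')
# # detections[0] rows: (class_id, conf, xmin, ymin, xmax, ymax)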
def __init__(self,
             random_brightness=(-48, 48, 0.5),
             random_contrast=(0.5, 1.8, 0.5),
             random_saturation=(0.5, 1.8, 0.5),
             random_hue=(18, 0.5),
             random_flip=0.5,
             random_translate=((0.03, 0.5), (0.03, 0.5), 0.5),
             random_scale=(0.5, 2.0, 0.5),
             n_trials_max=3,
             clip_boxes=False,
             overlap_criterion='area',
             bounds_box_filter=(0.3, 1.0),
             bounds_validator=(0.5, 1.0),
             n_boxes_min=1,
             background=(0, 0, 0),
             labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):

    if (random_scale[0] >= 1) or (random_scale[1] <= 1):
        raise ValueError("This sequence of transformations only makes sense if the minimum "
                         "scaling factor is <1 and the maximum scaling factor is >1.")

    self.n_trials_max = n_trials_max
    self.clip_boxes = clip_boxes
    self.overlap_criterion = overlap_criterion
    self.bounds_box_filter = bounds_box_filter
    self.bounds_validator = bounds_validator
    self.n_boxes_min = n_boxes_min
    self.background = background
    self.labels_format = labels_format

    # Determines which boxes are kept in an image after the transformations have been applied.
    self.box_filter = BoxFilter(overlap_criterion=self.overlap_criterion,
                                bounds=self.bounds_box_filter,
                                labels_format=self.labels_format)

    # Determines whether the result of the transformations is a valid training image.
    self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion,
                                          bounds=self.bounds_validator,
                                          n_boxes_min=self.n_boxes_min,
                                          labels_format=self.labels_format)

    # Utility distortions
    self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV')
    self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB')
    self.convert_to_float32 = ConvertDataType(to='float32')
    self.convert_to_uint8 = ConvertDataType(to='uint8')
    self.convert_to_3_channels = ConvertTo3Channels()  # Make sure all images end up having 3 channels.

    # Photometric transformations
    self.random_brightness = RandomBrightness(lower=random_brightness[0],
                                              upper=random_brightness[1],
                                              prob=random_brightness[2])
    self.random_contrast = RandomContrast(lower=random_contrast[0],
                                          upper=random_contrast[1],
                                          prob=random_contrast[2])
    self.random_saturation = RandomSaturation(lower=random_saturation[0],
                                              upper=random_saturation[1],
                                              prob=random_saturation[2])
    self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1])

    # Geometric transformations
    self.random_flip = RandomFlip(dim='horizontal',
                                  prob=random_flip,
                                  labels_format=self.labels_format)
    self.random_translate = RandomTranslate(dy_minmax=random_translate[0],
                                            dx_minmax=random_translate[1],
                                            prob=random_translate[2],
                                            clip_boxes=self.clip_boxes,
                                            box_filter=self.box_filter,
                                            image_validator=self.image_validator,
                                            n_trials_max=self.n_trials_max,
                                            background=self.background,
                                            labels_format=self.labels_format)
    self.random_zoom_in = RandomScale(min_factor=1.0,
                                      max_factor=random_scale[1],
                                      prob=random_scale[2],
                                      clip_boxes=self.clip_boxes,
                                      box_filter=self.box_filter,
                                      image_validator=self.image_validator,
                                      n_trials_max=self.n_trials_max,
                                      background=self.background,
                                      labels_format=self.labels_format)
    self.random_zoom_out = RandomScale(min_factor=random_scale[0],
                                       max_factor=1.0,
                                       prob=random_scale[2],
                                       clip_boxes=self.clip_boxes,
                                       box_filter=self.box_filter,
                                       image_validator=self.image_validator,
                                       n_trials_max=self.n_trials_max,
                                       background=self.background,
                                       labels_format=self.labels_format)

    # If we zoom in, do translation before scaling.
    self.sequence1 = [self.convert_to_3_channels,
                      self.convert_to_float32,
                      self.random_brightness,
                      self.random_contrast,
                      self.convert_to_uint8,
                      self.convert_RGB_to_HSV,
                      self.convert_to_float32,
                      self.random_saturation,
                      self.random_hue,
                      self.convert_to_uint8,
                      self.convert_HSV_to_RGB,
                      self.random_translate,
                      self.random_zoom_in,
                      self.random_flip]

    # If we zoom out, do scaling before translation.
    self.sequence2 = [self.convert_to_3_channels,
                      self.convert_to_float32,
                      self.random_brightness,
                      self.convert_to_uint8,
                      self.convert_RGB_to_HSV,
                      self.convert_to_float32,
                      self.random_saturation,
                      self.random_hue,
                      self.convert_to_uint8,
                      self.convert_HSV_to_RGB,
                      self.convert_to_float32,
                      self.random_contrast,
                      self.convert_to_uint8,
                      self.random_zoom_out,
                      self.random_translate,
                      self.random_flip]
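
# Example usage (an illustrative sketch: in ssd_keras this constructor belongs
# to `DataAugmentationConstantInputSize`, whose `class` statement and
# `__call__` lie outside this excerpt -- the name and call pattern below are
# assumptions on that basis):
#
# augmentation = DataAugmentationConstantInputSize()  # defaults as in __init__ above
# augmented_image, augmented_labels = augmentation(image, labels)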
def predict_all_to_txt(model,
                       img_height,
                       img_width,
                       data_generator,
                       batch_size,
                       data_generator_mode='resize',
                       classes=['background',
                                'aeroplane', 'bicycle', 'bird', 'boat',
                                'bottle', 'bus', 'car', 'cat',
                                'chair', 'cow', 'diningtable', 'dog',
                                'horse', 'motorbike', 'person', 'pottedplant',
                                'sheep', 'sofa', 'train', 'tvmonitor'],
                       out_file_prefix='comp3_det_test_',
                       model_mode='training',
                       confidence_thresh=0.01,
                       iou_threshold=0.45,
                       top_k=200,
                       pred_coords='centroids',
                       normalize_coords=True):
    '''
    Runs detection predictions over the whole dataset given a model and saves them in a text file
    in the Pascal VOC detection results format, i.e. the format in which the Pascal VOC test server
    expects results. This will result in `n_classes` text files, where each file contains the
    predictions for one class.

    Arguments:
        model (Keras model): A Keras SSD model object.
        img_height (int): The input image height for the model.
        img_width (int): The input image width for the model.
        data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
        batch_size (int): The batch size for the evaluation.
        data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images
            will be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the
            aspect ratios of the images. If 'pad', the input images will be first padded so that they
            have the aspect ratio defined by `img_height` and `img_width` and then resized to
            `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
        classes (list or dict, optional): A list or dictionary that maps the consecutive class IDs
            predicted by the model to their respective class name strings. The list must contain the
            background class for class ID zero.
        out_file_prefix (str, optional): A prefix for the output text file names. The suffix to each
            output text file name will be the respective class name followed by the `.txt` file extension.
            This string is also how you specify the directory in which the results are to be saved.
        model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference'
            or 'inference_fast'. This is needed in order to know whether the model output is already decoded
            or still needs to be decoded. Refer to the model documentation for the meaning of the individual modes.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a
            specific positive class in order to be considered for the non-maximum suppression stage for the
            respective class. A lower value will result in a larger part of the selection process being done
            by the non-maximum suppression stage, while a larger value will result in a larger part of the
            selection process happening in the confidence thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater
            than `iou_threshold` with a locally maximal box will be removed from the set of predictions for
            a given class, where 'maximal' refers to the box score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item
            after the non-maximum suppression stage. Defaults to 200, following the paper.
        pred_coords (str, optional): The box coordinate format that the model outputs. Can be either
            'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
            'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format
            `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates
            (i.e. coordinates in [0,1]) and you wish to transform these relative coordinates back to
            absolute coordinates. If the model outputs relative coordinates, but you do not want to
            convert them back to absolute coordinates, set this to `False`. Do not set this to `True`
            if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.

    Returns:
        None.
    '''

    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width)
    if data_generator_mode == 'resize':
        transformations = [convert_to_3_channels, resize]
    elif data_generator_mode == 'pad':
        random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width / img_height, clip_boxes=False)
        transformations = [convert_to_3_channels, random_pad, resize]
    else:
        raise ValueError("Unexpected argument value: `data_generator_mode` can be either of "
                         "'resize' or 'pad', but received '{}'.".format(data_generator_mode))

    # Set the generator parameters.
    generator = data_generator.generate(batch_size=batch_size,
                                        shuffle=False,
                                        transformations=transformations,
                                        label_encoder=None,
                                        returns={'processed_images', 'image_ids', 'inverse_transform'},
                                        keep_images_without_gt=True)

    # We have to generate a separate results file for each class.
    results = []
    for i in range(1, len(classes)):
        # Create one text file per class and put it in our results list.
        results.append(open('{}{}.txt'.format(out_file_prefix, classes[i]), 'w'))

    # Compute the number of batches to iterate over the entire dataset.
    n_images = data_generator.get_dataset_size()
    print("Number of images in the evaluation dataset: {}".format(n_images))
    n_batches = int(ceil(n_images / batch_size))

    # Loop over all batches.
    tr = trange(n_batches, file=sys.stdout)
    tr.set_description('Producing results files')
    for j in tr:
        # Generate batch.
        batch_X, batch_image_ids, batch_inverse_transforms = next(generator)
        # Predict.
        y_pred = model.predict(batch_X)
        # If the model was created in 'training' mode, the raw predictions need to
        # be decoded and filtered, otherwise that's already taken care of.
        if model_mode == 'training':
            # Decode.
            y_pred = decode_y(y_pred,
                              confidence_thresh=confidence_thresh,
                              iou_threshold=iou_threshold,
                              top_k=top_k,
                              input_coords=pred_coords,
                              normalize_coords=normalize_coords,
                              img_height=img_height,
                              img_width=img_width)
        else:
            # Filter out the all-zeros dummy elements of `y_pred`.
            y_pred_filtered = []
            for i in range(len(y_pred)):
                y_pred_filtered.append(y_pred[i][y_pred[i, :, 0] != 0])
            y_pred = y_pred_filtered
        # Convert the predicted box coordinates for the original images.
        y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)

        # Convert each predicted box into the results format.
        for k, batch_item in enumerate(y_pred):
            for box in batch_item:
                image_id = batch_image_ids[k]
                class_id = int(box[0])
                # Round the box coordinates to reduce the file size.
                confidence = str(round(box[1], 4))
                xmin = str(round(box[2], 1))
                ymin = str(round(box[3], 1))
                xmax = str(round(box[4], 1))
                ymax = str(round(box[5], 1))
                prediction = [image_id, confidence, xmin, ymin, xmax, ymax]
                prediction_txt = ' '.join(prediction) + '\n'
                # Write the predicted box to the text file for its class.
                results[class_id - 1].write(prediction_txt)

    # Close all the files.
    for results_file in results:
        results_file.close()

    print("All results files saved.")
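
# Example usage (an illustrative sketch; the paths and the `val_dataset`
# generator below are assumptions, not values from this module):
#
# predict_all_to_txt(model=model,
#                    img_height=300, img_width=300,
#                    data_generator=val_dataset,
#                    batch_size=8,
#                    out_file_prefix='results/comp3_det_test_',
#                    model_mode='training')
# # Produces one file per class, e.g. results/comp3_det_test_car.txt, where
# # each line reads "<image_id> <confidence> <xmin> <ymin> <xmax> <ymax>".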
def ssd_model(config: Dict, train_dataset, val_dataset, callbacks_list):
    """Train an SSD300 model.

    Parameters
    ----------
    config : Dict
        Config (parsed from YAML/JSON) containing all training parameters.
    train_dataset : DataGenerator
        The training dataset.
    val_dataset : DataGenerator
        The validation dataset.
    callbacks_list : list
        The Keras callbacks to use during training.
    """
    # NB: `scales`, `aspect_ratios`, `two_boxes_for_ar1`, `steps`, `offsets`,
    # `clip_boxes`, `variances`, `normalize_coords`, `mean_color` and
    # `swap_channels` are assumed to be defined at module level.
    start_train = timer()

    img_height = config['training']['img_height']  # Height of the input images
    img_width = config['training']['img_width']  # Width of the input images
    img_channels = config['training']['img_channels']  # Number of color channels
    n_classes = config['training']['n_classes']  # Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO

    model = ssd_300(image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    mode='training',
                    l2_regularization=config['training']['l2_regularization'],
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    clip_boxes=clip_boxes,
                    variances=variances,
                    normalize_coords=normalize_coords,
                    subtract_mean=mean_color,
                    swap_channels=swap_channels)

    weights_path = './weights/VGG_ILSVRC_16_layers_fc_reduced.h5'
    model.load_weights(weights_path, by_name=True)

    adam = Adam(lr=config['training']['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
    model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

    batch_size = config['training']['batch_size']

    ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                                img_width=img_width,
                                                background=mean_color)
    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width)

    predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                       model.get_layer('fc7_mbox_conf').output_shape[1:3],
                       model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

    ssd_input_encoder = SSDInputEncoder(img_height=img_height,
                                        img_width=img_width,
                                        n_classes=n_classes,
                                        predictor_sizes=predictor_sizes,
                                        scales=scales,
                                        aspect_ratios_per_layer=aspect_ratios,
                                        two_boxes_for_ar1=two_boxes_for_ar1,
                                        steps=steps,
                                        offsets=offsets,
                                        clip_boxes=clip_boxes,
                                        variances=variances,
                                        matching_type='multi',
                                        pos_iou_threshold=0.5,
                                        neg_iou_limit=0.5,
                                        normalize_coords=normalize_coords)

    train_generator = train_dataset.generate(batch_size=batch_size,
                                             shuffle=True,
                                             transformations=[ssd_data_augmentation],
                                             label_encoder=ssd_input_encoder,
                                             returns={'processed_images', 'encoded_labels'},
                                             keep_images_without_gt=False)
    val_generator = val_dataset.generate(batch_size=batch_size,
                                         shuffle=False,
                                         transformations=[convert_to_3_channels, resize],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images', 'encoded_labels'},
                                         keep_images_without_gt=False)

    # Get the number of samples in the training and validation datasets.
    train_dataset_size = train_dataset.get_dataset_size()
    val_dataset_size = val_dataset.get_dataset_size()
    print(f"[INFO]...Number of images in the training dataset: {train_dataset_size}")
    print(f"[INFO]...Number of images in the validation dataset: {val_dataset_size}")
    print(f"[INFO]...Weights will be saved at {config['training']['weight_save_path']}")

    history = model.fit_generator(generator=train_generator,
                                  steps_per_epoch=config['training']['steps_per_epoch'],
                                  epochs=config['training']['epochs'],
                                  callbacks=callbacks_list,
                                  validation_data=val_generator,
                                  validation_steps=ceil(val_dataset_size / batch_size))

    end_train = timer()
    print(f"[INFO]...Total time taken by Training Job is {(end_train - start_train)/60:.2f} min(s)")
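
# A minimal config sketch covering exactly the keys `ssd_model` reads above
# (the values are illustrative assumptions, not recommended settings):
#
# config = {
#     'training': {
#         'img_height': 300, 'img_width': 300, 'img_channels': 3,
#         'n_classes': 20,
#         'l2_regularization': 0.0005,
#         'learning_rate': 0.001,
#         'batch_size': 16,
#         'steps_per_epoch': 500,
#         'epochs': 120,
#         'weight_save_path': './weights/',
#     }
# }
# ssd_model(config, train_dataset, val_dataset, callbacks_list=[])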
def train_VOC(config):
    '''
    Train the given configuration; the configuration must be constructed
    according to the utility script found in utils/generateconfig.py.

    Arguments:
        config : the configuration of the model to use; should already be loaded.
    '''

    ###################################
    ### PATHS AND PARAMETERS
    ###################################
    datadir = config.DATA_DIR
    local_dir = config.ROOT_FOLDER
    img_shape = config.IMG_SHAPE
    classes = config.CLASSES
    checkpoint_output = os.path.join(local_dir, 'models', config.CHECKPOINT_NAME)
    model_output = os.path.join(local_dir, 'models', config.MODEL_NAME)
    img_height = img_shape[0]  # Height of the model input images
    img_width = img_shape[1]  # Width of the model input images
    img_channels = img_shape[2]  # Number of color channels of the model input images
    mean_color = [123, 117, 104]  # The per-channel mean of the images in the dataset. Do not change this value if you're using any of the pre-trained weights.
    swap_channels = [2, 1, 0]  # The color channel order in the original SSD is BGR, so we'll have the model reverse the color channel order of the input images.
    n_classes = 20  # Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO
    scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]  # The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets
    scales = scales_pascal
    aspect_ratios = [[1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5]]  # The anchor box aspect ratios used in the original SSD300; the order matters
    two_boxes_for_ar1 = True
    steps = [8, 16, 32, 64, 100, 300]  # The space between two adjacent anchor box center points for each predictor layer.
    offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]  # The offsets of the first anchor box center points from the top and left borders of the image as a fraction of the step size for each predictor layer.
    clip_boxes = False  # Whether or not to clip the anchor boxes to lie entirely within the image boundaries
    variances = [0.1, 0.1, 0.2, 0.2]  # The variances by which the encoded target coordinates are divided as in the original implementation
    normalize_coords = True
    batch_size = config.BATCH_SIZE  # Change the batch size if you like, or if you run into GPU memory issues.

    ###################################
    ### BUILDING MODEL
    ###################################
    K.clear_session()  # Clear previous models from memory.
    model = ssd_300(image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    mode='training',
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    clip_boxes=clip_boxes,
                    variances=variances,
                    normalize_coords=normalize_coords,
                    subtract_mean=mean_color,
                    swap_channels=swap_channels)

    weights_path = os.path.join(local_dir, 'weights', 'VGG_VOC0712_SSD_300x300_iter_120000.h5')
    model.load_weights(weights_path, by_name=True)

    # adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=False)
    ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
    model.compile(optimizer=sgd, loss=ssd_loss.compute_loss)

    ###################################
    ### LOADING DATA
    ###################################
    train_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)
    val_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)

    images_dir = os.path.join(datadir, 'Images')
    annotations_dir = os.path.join(datadir, 'Annotations')
    trainval_image_set_filename = os.path.join(datadir, 'ImageSets', 'train.txt')
    test_image_set_filename = os.path.join(datadir, 'ImageSets', 'val.txt')

    # The XML parser needs to know what object class names to look for and in
    # which order to map them to integers.
    train_dataset.parse_xml(images_dirs=[images_dir],
                            image_set_filenames=[trainval_image_set_filename],
                            annotations_dirs=[annotations_dir],
                            classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False)
    val_dataset.parse_xml(images_dirs=[images_dir],
                          image_set_filenames=[test_image_set_filename],
                          annotations_dirs=[annotations_dir],
                          classes=classes,
                          include_classes='all',
                          exclude_truncated=False,
                          exclude_difficult=True,
                          ret=False)

    train_dataset.create_hdf5_dataset(file_path='flowers_train.h5',
                                      resize=False,
                                      variable_image_size=True,
                                      verbose=True)
    val_dataset.create_hdf5_dataset(file_path='flowers_val.h5',
                                    resize=False,
                                    variable_image_size=True,
                                    verbose=True)

    ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                                img_width=img_width,
                                                background=mean_color)
    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width)

    # The encoder constructor needs the spatial dimensions of the model's
    # predictor layers to create the anchor boxes.
    predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                       model.get_layer('fc7_mbox_conf').output_shape[1:3],
                       model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

    ssd_input_encoder = SSDInputEncoder(img_height=img_height,
                                        img_width=img_width,
                                        n_classes=n_classes,
                                        predictor_sizes=predictor_sizes,
                                        scales=scales,
                                        aspect_ratios_per_layer=aspect_ratios,
                                        two_boxes_for_ar1=two_boxes_for_ar1,
                                        steps=steps,
                                        offsets=offsets,
                                        clip_boxes=clip_boxes,
                                        variances=variances,
                                        matching_type='multi',
                                        pos_iou_threshold=0.5,
                                        neg_iou_limit=0.5,
                                        normalize_coords=normalize_coords)

    train_generator = train_dataset.generate(batch_size=batch_size,
                                             shuffle=True,
                                             transformations=[ssd_data_augmentation],
                                             label_encoder=ssd_input_encoder,
                                             returns={'processed_images', 'encoded_labels'},
                                             keep_images_without_gt=False)
    val_generator = val_dataset.generate(batch_size=batch_size,
                                         shuffle=False,
                                         transformations=[convert_to_3_channels, resize],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images', 'encoded_labels'},
                                         keep_images_without_gt=False)

    # Get the number of samples in the training and validation datasets.
    train_dataset_size = train_dataset.get_dataset_size()
    val_dataset_size = val_dataset.get_dataset_size()
    print("Number of images in the training dataset:\t{:>6}".format(train_dataset_size))
    print("Number of images in the validation dataset:\t{:>6}".format(val_dataset_size))

    ###################################
    ### PREPARE TRAINING
    ###################################
    def lr_schedule(epoch):
        if epoch < 80:
            return 0.001
        elif epoch < 100:
            return 0.0001
        else:
            return 0.00001

    model_checkpoint = ModelCheckpoint(filepath=checkpoint_output,
                                       monitor='val_loss',
                                       verbose=1,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto',
                                       period=1)
    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0.0,
                                   patience=10,
                                   verbose=1)
    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule, verbose=1)
    terminate_on_nan = TerminateOnNaN()

    callbacks = [model_checkpoint,
                 learning_rate_scheduler,
                 terminate_on_nan,
                 early_stopping]

    ###################################
    ### TRAINING
    ###################################
    epochs = config.EPOCHS
    steps_per_epoch = ceil(train_dataset_size / batch_size)

    model.summary()
    history = model.fit_generator(generator=train_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=epochs,
                                  callbacks=callbacks,
                                  validation_data=val_generator,
                                  validation_steps=ceil(val_dataset_size / batch_size))

    model.save(model_output)
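
# Example usage (an illustrative sketch; the config object mirrors the
# attributes `train_VOC` reads above and would normally come from
# utils/generateconfig.py -- the values below are assumptions):
#
# from types import SimpleNamespace
# config = SimpleNamespace(
#     DATA_DIR='data/VOC', ROOT_FOLDER='.',
#     IMG_SHAPE=(300, 300, 3),
#     CLASSES=['background', 'aeroplane', 'bicycle'],  # truncated for brevity
#     CHECKPOINT_NAME='ssd300_checkpoint.h5',
#     MODEL_NAME='ssd300_final.h5',
#     BATCH_SIZE=16, EPOCHS=120)
# train_VOC(config)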
def load_VOC_IMG_generators(self, model):
    print('Making VOC image generators')
    datadir = self.datas['DATA_PATH']
    train_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)
    val_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)
    test_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)

    images_dir = os.path.join(datadir, 'Images')
    annotations_dir = os.path.join(datadir, 'Annotations')
    train_image_set_filename = os.path.join(datadir, 'ImageSets', 'train.txt')
    val_image_set_filename = os.path.join(datadir, 'ImageSets', 'val.txt')
    test_image_set_filename = os.path.join(datadir, 'ImageSets', 'test.txt')
    generator_options = self.datas['GENERATOR']

    train_dataset.parse_xml(images_dirs=[images_dir],
                            image_set_filenames=[train_image_set_filename],
                            annotations_dirs=[annotations_dir],
                            classes=self.datas['CLASSES'],
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False)
    val_dataset.parse_xml(images_dirs=[images_dir],
                          image_set_filenames=[val_image_set_filename],
                          annotations_dirs=[annotations_dir],
                          classes=self.datas['CLASSES'],
                          include_classes='all',
                          exclude_truncated=False,
                          exclude_difficult=False,
                          ret=False)
    test_dataset.parse_xml(images_dirs=[images_dir],
                           image_set_filenames=[test_image_set_filename],
                           annotations_dirs=[annotations_dir],
                           classes=self.datas['CLASSES'],
                           include_classes='all',
                           exclude_truncated=False,
                           exclude_difficult=False,
                           ret=False)

    convert_to_3_channels = ConvertTo3Channels()
    target_size = generator_options['TARGET_SIZE']
    resize = Resize(height=target_size[0], width=target_size[1])

    predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                       model.get_layer('fc7_mbox_conf').output_shape[1:3],
                       model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

    scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]  # The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets
    scales = scales_pascal
    aspect_ratios = [[1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5]]  # The anchor box aspect ratios used in the original SSD300; the order matters
    steps = [8, 16, 32, 64, 100, 300]  # The space between two adjacent anchor box center points for each predictor layer.
    two_boxes_for_ar1 = True
    mean_color = [123, 117, 104]  # TODO: add this as a parameter
    offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    clip_boxes = False
    variances = [0.1, 0.1, 0.2, 0.2]
    normalize_coords = True

    ssd_input_encoder = SSDInputEncoder(img_height=target_size[0],
                                        img_width=target_size[1],
                                        n_classes=20,  # TODO: handle subsampling
                                        predictor_sizes=predictor_sizes,
                                        scales=scales,
                                        aspect_ratios_per_layer=aspect_ratios,
                                        two_boxes_for_ar1=two_boxes_for_ar1,
                                        steps=steps,
                                        offsets=offsets,
                                        clip_boxes=clip_boxes,
                                        variances=variances,
                                        matching_type='multi',
                                        pos_iou_threshold=0.5,
                                        neg_iou_limit=0.5,
                                        normalize_coords=normalize_coords)

    train_generator = train_dataset.generate(batch_size=generator_options['BATCH_SIZE'],
                                             shuffle=True,
                                             transformations=[convert_to_3_channels, resize],
                                             label_encoder=ssd_input_encoder,
                                             returns={'processed_images', 'encoded_labels'},
                                             keep_images_without_gt=False)
    val_generator = val_dataset.generate(batch_size=generator_options['BATCH_SIZE'],
                                         shuffle=True,
                                         transformations=[convert_to_3_channels, resize],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images', 'encoded_labels'},
                                         keep_images_without_gt=False)
    test_generator = test_dataset.generate(batch_size=generator_options['BATCH_SIZE'],
                                           shuffle=True,
                                           transformations=[convert_to_3_channels, resize],
                                           label_encoder=ssd_input_encoder,
                                           returns={'processed_images', 'encoded_labels'},
                                           keep_images_without_gt=False)

    # Pair each generator with the size of its own dataset. (The original code
    # mistakenly returned the training set size for the test generator.)
    return ([train_generator, train_dataset.get_dataset_size()],
            [val_generator, val_dataset.get_dataset_size()],
            [test_generator, test_dataset.get_dataset_size()])
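
# Example usage (an illustrative sketch: `trainer` stands in for whatever
# object defines `load_VOC_IMG_generators`, and the epoch count is an
# assumption):
#
# (train_gen, n_train), (val_gen, n_val), (test_gen, n_test) = \
#     trainer.load_VOC_IMG_generators(model)
# batch_size = trainer.datas['GENERATOR']['BATCH_SIZE']
# model.fit_generator(generator=train_gen,
#                     steps_per_epoch=ceil(n_train / batch_size),
#                     validation_data=val_gen,
#                     validation_steps=ceil(n_val / batch_size),
#                     epochs=120)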