Example #1
def train(args):
    model = ssd_300(mode='training',
                    image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    limit_boxes=limit_boxes,
                    variances=variances,
                    coords=coords,
                    normalize_coords=normalize_coords,
                    subtract_mean=subtract_mean,
                    divide_by_stddev=None,
                    swap_channels=swap_channels)

    model.summary()  # summary() prints directly; wrapping it in print() would just output "None"

    predictor_sizes = [model.get_layer('conv11_mbox_conf').output_shape[1:3],
                       model.get_layer('conv13_mbox_conf').output_shape[1:3],
                       model.get_layer('conv14_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv15_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv16_2_mbox_conf').output_shape[1:3],
                       model.get_layer('conv17_2_mbox_conf').output_shape[1:3]]

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

    # Note: 'accuracy' is not a meaningful metric for SSD's encoded box targets; monitor the SSD loss instead.
    model.compile(optimizer=adam, loss=ssd_loss.compute_loss, metrics=["accuracy"])

    train_dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    val_dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    # 2: Parse the image and label lists for the training and validation datasets. This can take a while.

    # TODO: Set the paths to the datasets here.

    VOC_2007_images_dir = args.voc_dir_path + '/VOC2007/JPEGImages/'

    # The directories that contain the annotations.
    VOC_2007_annotations_dir = args.voc_dir_path + '/VOC2007/Annotations/'

    # The paths to the image sets.
    VOC_2007_train_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Layout/trainval.txt'

    VOC_2007_val_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Main/test.txt'

    # The XML parser needs to know what object class names to look for and in which order to map them to integers.

    classes = ['background',
               'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat',
               'chair', 'cow', 'diningtable', 'dog',
               'horse', 'motorbike', 'person', 'pottedplant',
               'sheep', 'sofa', 'train', 'tvmonitor']

    train_dataset.parse_xml(images_dirs=[VOC_2007_images_dir],
                            image_set_filenames=[VOC_2007_train_image_set_filename],
                            annotations_dirs=[VOC_2007_annotations_dir],
                            classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False)

    val_dataset.parse_xml(images_dirs=[VOC_2007_images_dir],
                          image_set_filenames=[VOC_2007_val_image_set_filename],
                          annotations_dirs=[VOC_2007_annotations_dir],
                          classes=classes,
                          include_classes='all',
                          exclude_truncated=False,
                          exclude_difficult=False,
                          ret=False
                          )

    # 3: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.

    ssd_box_encoder = SSDBoxEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    min_scale=None,
                                    max_scale=None,
                                    scales=scales,
                                    aspect_ratios_global=None,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    limit_boxes=limit_boxes,
                                    variances=variances,
                                    pos_iou_threshold=0.5,
                                    neg_iou_threshold=0.2,
                                    coords=coords,
                                    normalize_coords=normalize_coords)

    batch_size = args.batch_size

    train_generator = train_dataset.generate(batch_size=batch_size,
                                             shuffle=True,
                                             train=True,
                                             ssd_box_encoder=ssd_box_encoder,
                                             convert_to_3_channels=True,
                                             equalize=False,
                                             brightness=(0.5, 2, 0.5),
                                             flip=0.5,
                                             translate=False,
                                             scale=False,
                                             max_crop_and_resize=(img_height, img_width, 1, 3),
                                             # This one is important because the Pascal VOC images vary in size
                                             random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
                                             # This one is important because the Pascal VOC images vary in size
                                             random_crop=False,
                                             crop=False,
                                             resize=False,
                                             gray=False,
                                             limit_boxes=True,
                                             # While the anchor boxes are not being clipped, the ground truth boxes should be
                                             include_thresh=0.4)

    val_generator = val_dataset.generate(batch_size=batch_size, shuffle=True, train=True,
                                         ssd_box_encoder=ssd_box_encoder, convert_to_3_channels=True, equalize=False,
                                         brightness=(0.5, 2, 0.5), flip=0.5, translate=False, scale=False,
                                         max_crop_and_resize=(img_height, img_width, 1, 3),
                                         random_pad_and_resize=(img_height, img_width, 1, 3, 0.5), random_crop=False,
                                         crop=False, resize=False, gray=False, limit_boxes=True, include_thresh=0.4)
    # Sanity check: draw one batch from the validation set and print the shape of the encoded labels.
    tmp_slice = next(
        val_dataset.generate(batch_size=batch_size, shuffle=True, train=True, ssd_box_encoder=ssd_box_encoder,
                             convert_to_3_channels=True, equalize=False, brightness=(0.5, 2, 0.5), flip=0.5,
                             translate=False, scale=False, max_crop_and_resize=(img_height, img_width, 1, 3),
                             random_pad_and_resize=(img_height, img_width, 1, 3, 0.5), random_crop=False, crop=False,
                             resize=False, gray=False, limit_boxes=True, include_thresh=0.4))
    print(tmp_slice[1].shape)

    # The number of steps per epoch comes from the command-line arguments below, so no sample counts are needed here.

    def lr_schedule(epoch):
        if epoch <= 300:
            return 0.001
        else:
            return 0.0001

    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule)

    checkpoint_path = "ssd300_epoch-{epoch:02d}.h5"

    checkpoint = ModelCheckpoint(checkpoint_path)

    log_path = "logs"

    callbacks = [checkpoint, learning_rate_scheduler]

    # TODO: Set the number of epochs to train for.
    epochs = args.epochs
    initial_epoch = args.initial_epoch

    history = model.fit_generator(generator=train_generator,
                                  steps_per_epoch=args.iterations_per_epoch,
                                  verbose=1,
                                  initial_epoch=initial_epoch,
                                  epochs=epochs,
                                  validation_data=val_generator,
                                  validation_steps=2,
                                  callbacks=callbacks
                                  )

    with open('model_architecture.json', 'w') as f:
        f.write(model.to_json())

    print('History:', history.history)

    # Pick the most recent checkpoint; zero-padded epoch numbers make lexicographic order chronological.
    weights_file = sorted(glob('*.h5'))[-1]

    tf.compat.v1.disable_eager_execution()
    tf.compat.v1.reset_default_graph()
    tf.keras.backend.set_learning_phase(0)
    model = ssd_300(mode='training',
                    image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    limit_boxes=limit_boxes,
                    variances=variances,
                    coords=coords,
                    normalize_coords=normalize_coords,
                    subtract_mean=subtract_mean,
                    divide_by_stddev=None,
                    swap_channels=swap_channels)

    model.load_weights(weights_file, by_name=True, skip_mismatch=True)

    sess = tf.compat.v1.keras.backend.get_session()
    saver = tf.compat.v1.train.Saver()
    if not os.path.exists('checkpoint'):
        os.mkdir('checkpoint')
    saver.save(sess, 'checkpoint/model')

    tf.compat.v1.summary.FileWriter('graph', sess.graph)
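
These examples reference module-level constants (img_height, scales, aspect_ratios, steps, offsets, variances, and so on) that are defined elsewhere in their source files. Below is a minimal, illustrative sketch of what such a configuration block might look like, assuming the standard SSD300 Pascal VOC setup from the ssd_keras project; the actual values in these scripts (which appear to use a MobileNet-based SSD, judging by the conv11/conv13 layer names) may well differ.

# Illustrative module-level configuration; the values follow the common SSD300
# Pascal VOC setup and are assumptions, not these scripts' actual settings.
img_height, img_width, img_channels = 300, 300, 3
n_classes = 21  # 20 Pascal VOC classes plus the background class
scales = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]  # one scale per predictor layer, plus one
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True
steps = [8, 16, 32, 64, 100, 300]  # anchor grid stride per predictor layer
offsets = [0.5] * 6                # anchor grid offset per predictor layer
limit_boxes = False                # whether to clip anchor boxes to the image boundaries
variances = [0.1, 0.1, 0.2, 0.2]
coords = 'centroids'
normalize_coords = True
subtract_mean = [123, 117, 104]    # per-channel mean to subtract from the input
swap_channels = True               # older ssd_keras versions took a boolean here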
Example #2
def test(args):
    # `model` is assumed to be built at module level, as in the train() examples.
    model.load_weights(args.weight_file)
    dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    test_images_dir = args.dir_path + '/img'
    test_image_set_filename = args.dir_path + '/test.txt'

    # The XML parser needs to know what object class names to look for and in which order to map them to integers.
    classes = ['neutral', 'anger', 'surprise', 'smile', 'sad']

    filenames, labels, image_ids = dataset.parse_xml(
        images_dirs=[test_images_dir],
        image_set_filenames=[test_image_set_filename],
        annotations_dirs=None,
        classes=classes,
        include_classes='all',
        exclude_truncated=False,
        exclude_difficult=False,
        ret=True)

    size = len(filenames)

    for i in range(size):

        image_path = filenames[i]
        ima = cv2.imread(image_path)
        orig_images = []

        orig_images.append(ima)

        image1 = cv2.resize(ima, (IMG_WIDTH, IMG_HEIGHT))  # cv2.resize expects (width, height)
        image1 = image1[np.newaxis, :, :, :]

        input_images = np.array(image1)
        start_time = time.time()
        y_pred = model.predict(input_images)
        print("Time Taken by ssd", time.time() - start_time)

        y_pred_decoded = decode_y(y_pred,
                                  confidence_thresh=0.01,
                                  iou_threshold=0.45,
                                  top_k=18,
                                  input_coords='centroids',
                                  normalize_coords=True,
                                  img_height=IMG_HEIGHT,
                                  img_width=IMG_WIDTH)

        pred_boxes = []
        pred_labels = []
        scores = []

        for box in y_pred_decoded[0]:
            xmin = int(box[-4] * orig_images[0].shape[1] / IMG_WIDTH)
            ymin = int(box[-3] * orig_images[0].shape[0] / IMG_HEIGHT)
            xmax = int(box[-2] * orig_images[0].shape[1] / IMG_WIDTH)
            ymax = int(box[-1] * orig_images[0].shape[0] / IMG_HEIGHT)
            class_id = int(box[0])
            score = box[1]
            pred_boxes.append([xmin, ymin, xmax, ymax])
            pred_labels.append(class_id)
            scores.append(score)

        pred_boxes = np.array(pred_boxes)
        pred_labels = np.array(pred_labels)
        top4_idx = np.argsort(scores)[::-1][:4]

        pred_boxes = pred_boxes[top4_idx]
        pred_labels = pred_labels[top4_idx]

        draw_box_and_label(image_path, ima, pred_boxes, pred_labels, classes)
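
Example #2 ends by calling draw_box_and_label, which is defined elsewhere in that script. Here is a minimal sketch of what such a helper could look like, using OpenCV; the signature is inferred from the call site, and the drawing details and output location are assumptions.

import os
import cv2

def draw_box_and_label(image_path, image, boxes, labels, classes, out_dir='output'):
    # Hypothetical helper matching the call site above; the real one may differ.
    os.makedirs(out_dir, exist_ok=True)
    for (xmin, ymin, xmax, ymax), class_id in zip(boxes, labels):
        # Draw the box and put the class name just above its top-left corner.
        cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 255, 0), 2)
        cv2.putText(image, classes[class_id], (int(xmin), max(int(ymin) - 5, 0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
    cv2.imwrite(os.path.join(out_dir, os.path.basename(image_path)), image)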
Example #3
def train(args):
    model = ssd_300(mode='training',
                    image_size=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS),
                    n_classes=n_classes,
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    limit_boxes=limit_boxes,
                    variances=variances,
                    coords=coords,
                    normalize_coords=normalize_coords,
                    subtract_mean=subtract_mean,
                    divide_by_stddev=None,
                    swap_channels=swap_channels)

    model.load_weights(args.weight_file, by_name=True, skip_mismatch=True)

    predictor_sizes = [
        model.get_layer('conv11_mbox_conf').output_shape[1:3],
        model.get_layer('conv13_mbox_conf').output_shape[1:3],
        model.get_layer('conv14_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv15_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv16_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv17_2_mbox_conf').output_shape[1:3]
    ]

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

    model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

    train_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    val_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    # 2: Parse the image and label lists for the training and validation datasets. This can take a while.

    # Set the paths to the datasets here.

    images_dir = args.dir_path + '/img'
    annotations_dir = args.dir_path + '/annotations'
    train_image_set_filename = args.dir_path + '/trainval.txt'
    val_image_set_filename = args.dir_path + '/val.txt'

    classes = ['neutral', 'anger', 'surprise', 'smile', 'sad']

    train_dataset.parse_xml(images_dirs=[images_dir],
                            image_set_filenames=[train_image_set_filename],
                            annotations_dirs=[annotations_dir],
                            classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False)

    val_dataset.parse_xml(images_dirs=[images_dir],
                          image_set_filenames=[val_image_set_filename],
                          annotations_dirs=[annotations_dir],
                          classes=classes,
                          include_classes='all',
                          exclude_truncated=False,
                          exclude_difficult=False,
                          ret=False)

    # 3: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.

    ssd_box_encoder = SSDBoxEncoder(img_height=IMG_HEIGHT,
                                    img_width=IMG_WIDTH,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    min_scale=None,
                                    max_scale=None,
                                    scales=scales,
                                    aspect_ratios_global=None,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    limit_boxes=limit_boxes,
                                    variances=variances,
                                    pos_iou_threshold=0.5,
                                    neg_iou_threshold=0.2,
                                    coords=coords,
                                    normalize_coords=normalize_coords)

    batch_size = args.batch_size

    train_generator = train_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(IMG_HEIGHT, IMG_WIDTH, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(IMG_HEIGHT, IMG_WIDTH, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,
        # While the anchor boxes are not being clipped, the ground truth boxes should be
        include_thresh=0.4)

    val_generator = val_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(IMG_HEIGHT, IMG_WIDTH, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(IMG_HEIGHT, IMG_WIDTH, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,
        # While the anchor boxes are not being clipped, the ground truth boxes should be
        include_thresh=0.4)

    # Get the number of samples in the training and validation datasets to compute the epoch lengths below.
    n_train_samples = train_dataset.get_n_samples()
    n_val_samples = val_dataset.get_n_samples()

    def lr_schedule(epoch):
        if epoch <= 300:
            return 0.001
        else:
            return 0.0001

    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule)
    checkpoint_path = args.checkpoint_path + '/ssd300_epoch-{epoch:02d}.h5'
    checkpoint = ModelCheckpoint(checkpoint_path)
    log_path = args.checkpoint_path + '/logs'

    tensorboard = TensorBoard(log_dir=log_path,
                              histogram_freq=0,
                              write_graph=True,
                              write_images=False)

    callbacks = [checkpoint, tensorboard, learning_rate_scheduler]

    epochs = args.epochs
    initial_epoch = args.initial_epoch

    history = model.fit_generator(
        generator=train_generator,
        steps_per_epoch=ceil(n_train_samples / batch_size),
        verbose=1,
        initial_epoch=initial_epoch,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=ceil(n_val_samples / batch_size),
        callbacks=callbacks)
Example #4
def train(args):
    model = small_ssd(mode='training',
                      alpha=0.25,
                      image_size=(img_height, img_width, img_channels),
                      n_classes=n_classes,
                      l2_regularization=0.0005,
                      scales=scales,
                      aspect_ratios_per_layer=aspect_ratios,
                      two_boxes_for_ar1=False,
                      steps=None,
                      offsets=None,
                      limit_boxes=limit_boxes,
                      variances=variances,
                      coords=coords,
                      normalize_coords=normalize_coords,
                      subtract_mean=subtract_mean,
                      divide_by_stddev=None,
                      swap_channels=swap_channels)

    predictor_sizes = [
        model.get_layer('feature_layer_1_mbox_conf').output_shape[1:3],
        model.get_layer('feature_layer_2_mbox_conf').output_shape[1:3],
    ]

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

    model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

    train_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    val_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    # 2: Parse the image and label lists for the training and validation datasets. This can take a while.

    # TODO: Set the paths to the datasets here.

    # Create the lists of image directories, annotation directories, and image-set files.
    train_images_dirs = []
    train_annotations_dirs = []
    train_filenames = []
    val_images_dirs = []
    val_annotations_dirs = []
    val_filenames = []
    if (data_type == "sonar"):
        if (sonar_range == "range5000"):
            base_im_path = "/projects/mines/working_mount/processed_sonar/new_data"
            base_an_path = "/projects/mines/Josh/mines_ground_truth/sonar/range5000"
            base_filenames_path = "/projects/mines/Josh/mines_file_names"
            #base_im_path = "/Users/Josh/processed_sonar"
            #base_an_path = "/Users/Josh/mines_ground_truth/sonar/range5000"
            #base_filenames_path = "/Users/Josh/mines_file_names"
            all_datasets = datasets_train + datasets_val
            for ds in all_datasets:
                #im_ds_path = "{}/k-8".format(ds)
                im_ds_path = "{}/range5000/k-8".format(ds)
                im_path = "{}/{}".format(base_im_path, im_ds_path)
                an_path = "{}/{}".format(base_an_path, ds)
                file_name_path = "{}/sonar_ds_{}_list.txt".format(
                    base_filenames_path, ds)
                if ds in datasets_train:
                    train_images_dirs.append(im_path)
                    train_annotations_dirs.append(an_path)
                    train_filenames.append(file_name_path)
                else:
                    val_images_dirs.append(im_path)
                    val_annotations_dirs.append(an_path)
                    val_filenames.append(file_name_path)

    # The XML parser needs to know what object class names to look for and in which order to map them to integers.

    classes = ['background', 'mine']

    train_dataset.parse_xml(images_dirs=train_images_dirs,
                            image_set_filenames=train_filenames,
                            annotations_dirs=train_annotations_dirs,
                            classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False,
                            data_type="sonar")

    val_dataset.parse_xml(images_dirs=val_images_dirs,
                          image_set_filenames=val_filenames,
                          annotations_dirs=val_annotations_dirs,
                          classes=classes,
                          include_classes='all',
                          exclude_truncated=False,
                          exclude_difficult=False,
                          ret=False,
                          data_type="sonar")

    # 3: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.

    ssd_box_encoder = SSDBoxEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    min_scale=None,
                                    max_scale=None,
                                    scales=scales,
                                    aspect_ratios_global=None,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=False,  # must match the model above, which was built with two_boxes_for_ar1=False
                                    steps=None,
                                    offsets=None,
                                    limit_boxes=limit_boxes,
                                    variances=variances,
                                    pos_iou_threshold=0.5,
                                    neg_iou_threshold=0.2,
                                    coords=coords,
                                    normalize_coords=normalize_coords)

    batch_size = args.batch_size

    train_generator = train_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=False,
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,
        # While the anchor boxes are not being clipped, the ground truth boxes should be
        include_thresh=0.4)

    val_generator = val_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=False,
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,
        # While the anchor boxes are not being clipped, the ground truth boxes should be
        include_thresh=0.4)

    # Get the number of samples in the training and validation datasets to compute the epoch lengths below.
    n_train_samples = train_dataset.get_n_samples()
    n_val_samples = val_dataset.get_n_samples()

    def lr_schedule(epoch):
        if epoch <= 300:
            return 0.001
        else:
            return 0.0001

    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule)

    checkpoint_path = args.checkpoint_path + "/ssd300_epoch-{epoch:02d}.h5"

    checkpoint = ModelCheckpoint(checkpoint_path)

    log_path = args.checkpoint_path + "/logs"

    tensorboard = TensorBoard(log_dir=log_path,
                              histogram_freq=0,
                              write_graph=True,
                              write_images=False)

    callbacks = [checkpoint, tensorboard, learning_rate_scheduler]

    # TODO: Set the number of epochs to train for.
    epochs = args.epochs
    initial_epoch = args.initial_epoch

    history = model.fit_generator(
        generator=train_generator,
        steps_per_epoch=ceil(n_train_samples / batch_size),
        verbose=1,
        initial_epoch=initial_epoch,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=ceil(n_val_samples / batch_size),
        callbacks=callbacks)
Example #5
def train(args):
    model = ssd_300(mode='training',
                    image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    limit_boxes=limit_boxes,
                    variances=variances,
                    coords=coords,
                    normalize_coords=normalize_coords,
                    subtract_mean=subtract_mean,
                    divide_by_stddev=None,
                    swap_channels=swap_channels)

    model.load_weights(args.weight_file, by_name=True, skip_mismatch=True)

    predictor_sizes = [
        model.get_layer('conv11_mbox_conf').output_shape[1:3],
        model.get_layer('conv13_mbox_conf').output_shape[1:3],
        model.get_layer('conv14_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv15_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv16_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv17_2_mbox_conf').output_shape[1:3]
    ]

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

    model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

    train_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    val_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    # 2: Parse the image and label lists for the training and validation datasets. This can take a while.

    # TODO: Set the paths to the datasets here.

    VOC_2007_images_dir = args.voc_dir_path + '/VOC2007/JPEGImages/'
    VOC_2012_images_dir = args.voc_dir_path + '/VOC2012/JPEGImages/'

    # The directories that contain the annotations.
    VOC_2007_annotations_dir = args.voc_dir_path + '/VOC2007/Annotations/'
    VOC_2012_annotations_dir = args.voc_dir_path + '/VOC2012/Annotations/'

    # The paths to the image sets.
    VOC_2007_train_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Main/trainval.txt'
    VOC_2012_train_image_set_filename = args.voc_dir_path + '/VOC2012/ImageSets/Main/trainval.txt'

    VOC_2007_val_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Main/test.txt'

    # The XML parser needs to know what object class names to look for and in which order to map them to integers.

    classes = [
        'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
        'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
        'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
        'tvmonitor'
    ]

    train_dataset.parse_xml(
        images_dirs=[VOC_2007_images_dir, VOC_2012_images_dir],
        image_set_filenames=[
            VOC_2007_train_image_set_filename,
            VOC_2012_train_image_set_filename
        ],
        annotations_dirs=[VOC_2007_annotations_dir, VOC_2012_annotations_dir],
        classes=classes,
        include_classes='all',
        exclude_truncated=False,
        exclude_difficult=False,
        ret=False)

    val_dataset.parse_xml(
        images_dirs=[VOC_2007_images_dir],
        image_set_filenames=[VOC_2007_val_image_set_filename],
        annotations_dirs=[VOC_2007_annotations_dir],
        classes=classes,
        include_classes='all',
        exclude_truncated=False,
        exclude_difficult=False,
        ret=False)

    # 3: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.

    ssd_box_encoder = SSDBoxEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    min_scale=None,
                                    max_scale=None,
                                    scales=scales,
                                    aspect_ratios_global=None,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    limit_boxes=limit_boxes,
                                    variances=variances,
                                    pos_iou_threshold=0.5,
                                    neg_iou_threshold=0.2,
                                    coords=coords,
                                    normalize_coords=normalize_coords)

    batch_size = args.batch_size

    train_generator = train_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,  # Clip ground truth boxes
        include_thresh=0.4)

    val_generator = val_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,  # Clip ground truth boxes
        include_thresh=0.4)
    # Get the number of samples in the training and validation datasets to compute the epoch lengths below.
    n_train_samples = train_dataset.get_n_samples()
    n_val_samples = val_dataset.get_n_samples()

    def lr_schedule(epoch):
        if epoch <= 300:
            return 0.001
        else:
            return 0.0001

    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule)

    checkpoint_path = args.checkpoint_path + "/ssd300_epoch-{epoch:02d}.h5"

    checkpoint = ModelCheckpoint(checkpoint_path)

    log_path = args.checkpoint_path + "/logs"

    tensorboard = TensorBoard(log_dir=log_path,
                              histogram_freq=0,
                              write_graph=True,
                              write_images=False)

    callbacks = [checkpoint, tensorboard, learning_rate_scheduler]

    # TODO: Set the number of epochs to train for.
    epochs = args.epochs
    initial_epoch = args.initial_epoch

    history = model.fit_generator(
        generator=train_generator,
        steps_per_epoch=ceil(n_train_samples / batch_size),
        verbose=1,
        initial_epoch=initial_epoch,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=ceil(n_val_samples / batch_size),
        callbacks=callbacks)

def train(args):
    model = ssd_300(mode='training',
                    image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    limit_boxes=limit_boxes,
                    variances=variances,
                    coords=coords,
                    normalize_coords=normalize_coords,
                    subtract_mean=subtract_mean,
                    divide_by_stddev=None,
                    swap_channels=swap_channels)

    model.load_weights(args.weight_file, by_name=True, skip_mismatch=True)

    predictor_sizes = [
        model.get_layer('conv11_mbox_conf').output_shape[1:3],
        model.get_layer('conv13_mbox_conf').output_shape[1:3],
        model.get_layer('conv14_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv15_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv16_2_mbox_conf').output_shape[1:3],
        model.get_layer('conv17_2_mbox_conf').output_shape[1:3]
    ]

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

    model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

    train_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    val_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    # 2: Parse the image and label lists for the training and validation datasets. This can take a while.

    # TODO: Set the paths to the datasets here.

    COCO_format_val_images_dir = args.ms_coco_dir_path + '/val/'
    COCO_format_train_images_dir = args.ms_coco_dir_path + '/train/'
    COCO_format_train_annotation_dir = args.ms_coco_dir_path + '/annotations/train.json'
    COCO_format_val_annotation_dir = args.ms_coco_dir_path + '/annotations/val.json'

    VOC_2007_images_dir = args.voc_dir_path + '/VOC2007/JPEGImages/'
    VOC_2012_images_dir = args.voc_dir_path + '/VOC2012/JPEGImages/'

    # The directories that contain the annotations.
    VOC_2007_annotations_dir = args.voc_dir_path + '/VOC2007/Annotations/'
    VOC_2012_annotations_dir = args.voc_dir_path + '/VOC2012/Annotations/'

    # The paths to the image sets.
    VOC_2007_train_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Main/trainval.txt'
    VOC_2012_train_image_set_filename = args.voc_dir_path + '/VOC2012/ImageSets/Main/trainval.txt'

    VOC_2007_val_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Main/test.txt'

    # The XML parser needs to know what object class names to look for and in which order to map them to integers.

    classes = [
        'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
        'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
        'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
        'tvmonitor'
    ]
    '''
          This is a JSON parser for the MS COCO datasets. It might be applicable to other datasets with minor changes to
          the code, but in its current form it expects the JSON format of the MS COCO datasets.

          Arguments:
              images_dirs (list, optional): A list of strings, where each string is the path of a directory that
                  contains images that are to be part of the dataset. This allows you to aggregate multiple datasets
                  into one (e.g. one directory that contains the images for MS COCO Train 2014, another one for MS COCO
                  Val 2014, another one for MS COCO Train 2017 etc.).
              annotations_filenames (list): A list of strings, where each string is the path of the JSON file
                  that contains the annotations for the images in the respective image directories given, i.e. one
                  JSON file per image directory that contains the annotations for all images in that directory.
                  The content of the JSON files must be in MS COCO object detection format. Note that these annotations
                  files do not necessarily need to contain ground truth information. MS COCO also provides annotations
                  files without ground truth information for the test datasets, called `image_info_[...].json`.
              ground_truth_available (bool, optional): Set `True` if the annotations files contain ground truth information.
              include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that
                  are to be included in the dataset. Defaults to 'all', in which case all boxes will be included
                  in the dataset.
              ret (bool, optional): Whether or not the image filenames and labels are to be returned.

          Returns:
              None by default, optionally the image filenames and labels.
          '''

    train_dataset.parse_json(
        images_dirs=[COCO_format_train_images_dir],
        annotations_filenames=[COCO_format_train_annotation_dir],
        ground_truth_available=True,
        include_classes='all',
        ret=False)

    val_dataset.parse_json(
        images_dirs=[COCO_format_val_images_dir],
        annotations_filenames=[COCO_format_val_annotation_dir],
        ground_truth_available=True,
        include_classes='all',
        ret=False)

    # 3: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.

    ssd_box_encoder = SSDBoxEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    min_scale=None,
                                    max_scale=None,
                                    scales=scales,
                                    aspect_ratios_global=None,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    limit_boxes=limit_boxes,
                                    variances=variances,
                                    pos_iou_threshold=0.5,
                                    neg_iou_threshold=0.2,
                                    coords=coords,
                                    normalize_coords=normalize_coords)

    batch_size = args.batch_size

    train_generator = train_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,
        # While the anchor boxes are not being clipped, the ground truth boxes should be
        include_thresh=0.4)

    val_generator = val_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        limit_boxes=True,
        # While the anchor boxes are not being clipped, the ground truth boxes should be
        include_thresh=0.4)
    # Get the number of samples in the training and validation datasets to compute the epoch lengths below.
    n_train_samples = train_dataset.get_n_samples()
    n_val_samples = val_dataset.get_n_samples()

    def lr_schedule(epoch):
        if epoch <= 300:
            return 0.001
        else:
            return 0.0001

    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule)

    checkpoint_path = args.checkpoint_path + "/ssd300_epoch-{epoch:02d}.h5"

    checkpoint = ModelCheckpoint(checkpoint_path)

    log_path = args.checkpoint_path + "/logs"

    tensorboard = TensorBoard(log_dir=log_path,
                              histogram_freq=0,
                              write_graph=True,
                              write_images=False)

    callbacks = [checkpoint, tensorboard, learning_rate_scheduler]

    # TODO: Set the number of epochs to train for.
    epochs = args.epochs
    initial_epoch = args.initial_epoch

    history = model.fit_generator(
        generator=train_generator,
        steps_per_epoch=ceil(n_train_samples / batch_size),
        verbose=1,
        initial_epoch=initial_epoch,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=ceil(n_val_samples / batch_size),
        callbacks=callbacks)

def main(args):
    # `model` is assumed to be built at module level, as in the train() functions above.
    model.load_weights(args.weight_file)
    dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    VOC_2007_images_dir = args.voc_dir_path + '/VOC2007/JPEGImages/'
    VOC_2012_images_dir = args.voc_dir_path + '/VOC2012/JPEGImages/'

    # The directories that contain the annotations.
    VOC_2007_annotations_dir = args.voc_dir_path + '/VOC2007/Annotations/'
    VOC_2012_annotations_dir = args.voc_dir_path + '/VOC2012/Annotations/'

    # The paths to the image sets.
    VOC_2007_train_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Main/trainval.txt'
    VOC_2012_train_image_set_filename = args.voc_dir_path + '/VOC2012/ImageSets/Main/trainval.txt'

    VOC_2007_val_image_set_filename = args.voc_dir_path + '/VOC2007/ImageSets/Main/test.txt'
    # VOC_2012_val_image_set_filename = '/media/shareit/manish/blitznet-master/Datasets/VOCdevkit/VOC2012/ImageSets/Main/test.txt'


    # The XML parser needs to know what object class names to look for and in which order to map them to integers.
    classes = ['background',
               'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat',
               'chair', 'cow', 'diningtable', 'dog',
               'horse', 'motorbike', 'person', 'pottedplant',
               'sheep', 'sofa', 'train', 'tvmonitor']


    filenames, labels, image_ids = dataset.parse_xml(images_dirs=[VOC_2007_images_dir],
                                                     image_set_filenames=[VOC_2007_val_image_set_filename],
                                                     annotations_dirs=[VOC_2007_annotations_dir],
                                                     classes=classes,
                                                     include_classes='all',
                                                     exclude_truncated=False,
                                                     exclude_difficult=False,
                                                     ret=True)

    size = len(filenames)
    detected_labels = []


    all_detections = [[None for i in range(len(classes))] for j in range(size)]
    all_annotations = [[None for i in range(len(classes))] for j in range(size)]

    for i in range(size):

        image_path = filenames[i]
        ima = cv2.imread(image_path)
        orig_images = []

        orig_images.append(ima)

        image1 = cv2.resize(ima, (img_width, img_height))  # cv2.resize expects (width, height)
        image1 = image1[np.newaxis,:,:,:]

        input_images = np.array(image1)


        start_time = time.time()
        y_pred = model.predict(input_images)
        print "Time Taken by ssd", time.time() - start_time

        y_pred_decoded = decode_y(y_pred,
                                  confidence_thresh=0.01,
                                  iou_threshold=0.45,
                                  top_k=100,
                                  input_coords='centroids',
                                  normalize_coords=True,
                                  img_height=img_height,
                                  img_width=img_width)

        pred_boxes = []
        pred_labels = []

        for box in y_pred_decoded[0]:

            xmin = int(box[-4] * orig_images[0].shape[1] / img_width)
            ymin = int(box[-3] * orig_images[0].shape[0] / img_height)
            xmax = int(box[-2] * orig_images[0].shape[1] / img_width)
            ymax = int(box[-1] * orig_images[0].shape[0] / img_height)
            class_id = int(box[0])
            score = box[1]

            pred_boxes.append([xmin, ymin, xmax, ymax, score])

            pred_labels.append(class_id)

        pred_boxes = np.array(pred_boxes)
        pred_labels = np.array(pred_labels)

        l = range(1, len(classes))  # all class labels except background (index 0)
        for label in l:
            if len(pred_labels):
                all_detections[i][label] = pred_boxes[pred_labels == label, :]

        true_label = np.array(labels[i])
        
        for label in l:
            if len(true_label) > 0:
                all_annotations[i][label] = true_label[true_label[:, 0] == label, 1:5].copy()
            else:
                all_annotations[i][label] = np.array([[]])

    average_precisions = {}


    for label in l:
        false_positives = np.zeros((0,))
        true_positives = np.zeros((0,))
        scores = np.zeros((0,))
        num_annotations = 0.0

        for i in range(size):
            annotations = all_annotations[i][label]
            annotations = annotations.astype(np.float32)


            num_annotations += annotations.shape[0]
            detected_annotations = []
            detections = all_detections[i][label]
            if detections is not None:
                detections = detections.astype(np.float32)

                for d in detections:
                    scores = np.append(scores, d[4])

                    try:
                        annotations[0][0]
                    except IndexError:
                        false_positives = np.append(false_positives, 1)
                        true_positives = np.append(true_positives, 0)
                        continue

                    overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
                    assigned_annotation = np.argmax(overlaps, axis=1)
                    max_overlap = overlaps[0, assigned_annotation]
                    
                    # `conf_threshold` is assumed to be a module-level IoU threshold for a true positive (typically 0.5).
                    if max_overlap >= conf_threshold and assigned_annotation not in detected_annotations:
                    
                        false_positives = np.append(false_positives, 0)
                        true_positives = np.append(true_positives, 1)
                        detected_annotations.append(assigned_annotation)
                    else:
                        false_positives = np.append(false_positives, 1)
                        true_positives = np.append(true_positives, 0)

        
        if num_annotations == 0:
            average_precisions[label] = 0
            continue
        indices = np.argsort(-scores)
        false_positives = false_positives[indices]
        true_positives = true_positives[indices]

        false_positives = np.cumsum(false_positives)
        true_positives = np.cumsum(true_positives)

        recall = true_positives / num_annotations
        
        precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)

        average_precision = compute_ap(recall, precision)
        average_precisions[label] = average_precision


    mean_ap = sum(float(ap) for ap in average_precisions.values()) / len(l)
    print(average_precisions)
    print('mAP is:', mean_ap)
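
The mAP computation in main() relies on two helpers imported elsewhere, compute_overlap and compute_ap. Here is a minimal sketch of plausible implementations, assuming the common all-point-interpolation definition of average precision (as used in keras-retinanet); the real helpers may differ in details such as pixel-boundary conventions.

import numpy as np

def compute_overlap(boxes, query_boxes):
    # Pairwise IoU between boxes (N, >=4) and query_boxes (M, >=4). Only the
    # first four columns ([xmin, ymin, xmax, ymax]) are used, so the trailing
    # score column on the detections above is ignored.
    overlaps = np.zeros((boxes.shape[0], query_boxes.shape[0]), dtype=np.float64)
    for i in range(boxes.shape[0]):
        for j in range(query_boxes.shape[0]):
            iw = min(boxes[i, 2], query_boxes[j, 2]) - max(boxes[i, 0], query_boxes[j, 0])
            ih = min(boxes[i, 3], query_boxes[j, 3]) - max(boxes[i, 1], query_boxes[j, 1])
            if iw > 0 and ih > 0:
                intersection = iw * ih
                union = ((boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
                         + (query_boxes[j, 2] - query_boxes[j, 0]) * (query_boxes[j, 3] - query_boxes[j, 1])
                         - intersection)
                overlaps[i, j] = intersection / union
    return overlaps

def compute_ap(recall, precision):
    # Average precision as the area under the precision-recall curve,
    # after making the precision envelope monotonically decreasing.
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([0.0], precision, [0.0]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    idx = np.where(mrec[1:] != mrec[:-1])[0]  # points where recall changes
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])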