def index_mapping_file(self, larger_one, smaller_one):
    """Locate every entry of the smaller class file inside the larger one.

    Both arguments are handed to ``read_class_names(..., dot_name_file=False)``.

    Args:
        larger_one: class-name source expected to contain every entry of
            ``smaller_one``.
        smaller_one: class-name source whose entries are looked up.

    Returns:
        list: for each entry of ``smaller_one`` (in iteration order), the
        value returned by ``.index()`` on the entries of ``larger_one``.

    Note:
        An entry missing from ``larger_one`` propagates whatever ``.index``
        raises (``ValueError`` if the reader returns a list -- TODO confirm).
    """
    superset = read_class_names(larger_one, dot_name_file=False)
    subset = read_class_names(smaller_one, dot_name_file=False)
    return [superset.index(name) for name in subset]
def __init__(self, dataset_type, NEW_CLASSES_TO_LEARN,
             TOTAL_CLASSES_WILL_KNOW_AFTER_THIS):
    """Configure the dataset for one incremental-learning step.

    Args:
        dataset_type: ``'train'`` selects the ``TRAIN_*`` settings; any other
            value selects the ``TEST_*`` settings.
        NEW_CLASSES_TO_LEARN: class-name source for the classes introduced in
            this step.
        TOTAL_CLASSES_WILL_KNOW_AFTER_THIS: class-name source for every class
            the model should know once this step is finished.
    """
    # Pick the split-specific settings in one place.
    if dataset_type == 'train':
        self.annot_path = TRAIN_ANNOT_PATH
        self.input_sizes = TRAIN_INPUT_SIZE
        self.batch_size = TRAIN_BATCH_SIZE
        self.data_aug = TRAIN_DATA_AUG
    else:
        self.annot_path = TEST_ANNOT_PATH
        self.input_sizes = TEST_INPUT_SIZE
        self.batch_size = TEST_BATCH_SIZE
        self.data_aug = TEST_DATA_AUG

    self.train_input_sizes = TRAIN_INPUT_SIZE
    self.strides = np.array(YOLO_STRIDES)

    self.classes = read_class_names(TOTAL_CLASSES_WILL_KNOW_AFTER_THIS,
                                    dot_name_file=False)
    self.num_classes = len(self.classes)

    # Divide the anchors by the strides; the double transpose lets the
    # strides broadcast over the leading axis of the anchor array.
    self.anchors = (np.array(YOLO_ANCHORS).T / self.strides).T
    self.anchor_per_scale = YOLO_ANCHOR_PER_SCALE
    self.max_bbox_per_scale = YOLO_MAX_BBOX_PER_SCALE

    self.annotations = self.load_annotations(dataset_type)
    self.num_samples = len(self.annotations)
    self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))

    self.TOTAL_CLASSES_WILL_KNOW_AFTER_THIS = TOTAL_CLASSES_WILL_KNOW_AFTER_THIS
    # Positions of the new classes inside the full Pascal VOC list,
    # e.g. [3, 8].
    self.new_classes = self.index_mapping_file(PASCAL_VOC_ALL_CLASSES,
                                               NEW_CLASSES_TO_LEARN)
    self.annotation_goes = 0
    self.prev_aug_image = 0
def __init__(self, FLAGS, is_training: bool, dataset_type: str = "converted_coco"):
    """Configure the dataset from ``FLAGS`` and the global ``cfg`` object.

    Args:
        FLAGS: parsed flags; ``FLAGS.tiny`` is stored and the whole object is
            forwarded to ``utils.load_config``.
        is_training: when true the ``cfg.TRAIN`` section is used, otherwise
            ``cfg.TEST``.
        dataset_type: stored verbatim on the instance.
    """
    self.tiny = FLAGS.tiny
    # load_config yields four values; only strides and anchors are kept.
    self.strides, self.anchors, _num_class, _xyscale = utils.load_config(FLAGS)
    self.dataset_type = dataset_type

    # Resolve the train/test section once instead of per attribute.
    split_cfg = cfg.TRAIN if is_training else cfg.TEST
    self.annot_path = split_cfg.ANNOT_PATH
    self.input_sizes = split_cfg.INPUT_SIZE
    self.batch_size = split_cfg.BATCH_SIZE
    self.data_aug = split_cfg.DATA_AUG

    self.train_input_sizes = cfg.TRAIN.INPUT_SIZE
    self.classes = utils.read_class_names(cfg.YOLO.CLASSES)
    self.num_classes = len(self.classes)
    self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE
    self.max_bbox_per_scale = 150

    self.annotations = self.load_annotations()
    self.num_samples = len(self.annotations)
    self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
    self.batch_count = 0
def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=''):
    """Compute the GIoU, confidence and class-probability losses for one scale.

    Args:
        pred: decoded predictions; channels 0:4 are read as xywh and 4:5 as
            confidence.
        conv: raw network output for this scale; reshaped here to
            ``(batch, size, size, 3, 5 + NUM_CLASS)``.
        label: target tensor; channels 0:4 xywh, 4:5 objectness, 5: class
            probabilities.
        bboxes: ground-truth boxes used for the background IoU test
            (indexed as ``[batch, box, coord]``).
        i: index into ``STRIDES`` for this scale.
        CLASSES: class-name source passed to ``read_class_names``.

    Returns:
        tuple: ``(giou_loss, conf_loss, prob_loss)``, each summed over axes
        1-4 and averaged over the batch.
    """
    num_classes = len(read_class_names(CLASSES, dot_name_file=False))

    raw_shape = tf.shape(conv)
    n_batch, grid = raw_shape[0], raw_shape[1]
    input_size = tf.cast(STRIDES[i] * grid, tf.float32)

    conv = tf.reshape(conv, (n_batch, grid, grid, 3, 5 + num_classes))
    raw_conf = conv[..., 4:5]
    raw_prob = conv[..., 5:]

    pred_xywh = pred[..., 0:4]
    pred_conf = pred[..., 4:5]

    label_xywh = label[..., 0:4]
    respond_bbox = label[..., 4:5]  # objectness
    label_prob = label[..., 5:]

    # Box regression: GIoU weighted by a factor that shrinks with box area.
    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    box_area = 1.0 * label_xywh[..., 2:3] * label_xywh[..., 3:4]
    bbox_loss_scale = 2.0 - box_area / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    # Best IoU of every predicted box against all ground-truth boxes.
    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :],
                   bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # A cell without an object whose best IoU is below the threshold is
    # treated as background.
    respond_bgd = (1.0 - respond_bbox) * tf.cast(
        max_iou < YOLO_IOU_LOSS_THRESH, tf.float32)

    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    # Confidence should be 1 where an object is present and 0 on background;
    # both terms share the same cross-entropy tensor.
    conf_bce = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=respond_bbox, logits=raw_conf)
    conf_loss = conf_focal * (respond_bbox * conf_bce + respond_bgd * conf_bce)

    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(
        labels=label_prob, logits=raw_prob)

    def _batch_mean(per_cell):
        # Sum over grid/anchor/channel axes, then average over the batch.
        return tf.reduce_mean(tf.reduce_sum(per_cell, axis=[1, 2, 3, 4]))

    return _batch_mean(giou_loss), _batch_mean(conf_loss), _batch_mean(prob_loss)
def __init__(self, dataset_type, TRAIN_CLASSES):
    """Configure the dataset for the given split.

    Args:
        dataset_type: ``'train'`` selects the ``TRAIN_*`` settings; any other
            value selects the ``TEST_*`` settings.
        TRAIN_CLASSES: class-name source passed to ``read_class_names``.
    """
    # Pick the split-specific settings in one place.
    if dataset_type == 'train':
        self.annot_path = TRAIN_ANNOT_PATH
        self.input_sizes = TRAIN_INPUT_SIZE
        self.batch_size = TRAIN_BATCH_SIZE
        self.data_aug = TRAIN_DATA_AUG
    else:
        self.annot_path = TEST_ANNOT_PATH
        self.input_sizes = TEST_INPUT_SIZE
        self.batch_size = TEST_BATCH_SIZE
        self.data_aug = TEST_DATA_AUG

    self.train_input_sizes = TRAIN_INPUT_SIZE
    self.strides = np.array(YOLO_STRIDES)

    self.classes = read_class_names(TRAIN_CLASSES)
    self.num_classes = len(self.classes)

    # Divide the anchors by the strides; the double transpose lets the
    # strides broadcast over the leading axis of the anchor array.
    self.anchors = (np.array(YOLO_ANCHORS).T / self.strides).T
    self.anchor_per_scale = YOLO_ANCHOR_PER_SCALE
    self.max_bbox_per_scale = YOLO_MAX_BBOX_PER_SCALE

    self.annotations = self.load_annotations(dataset_type)
    self.num_samples = len(self.annotations)
    self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
    self.batch_count = 0
def Create_Yolov3(input_size=416, channels=3, training=False, CLASSES='', dot_name_file=False):
    """Assemble a Keras model around the YOLOv3 backbone.

    Args:
        input_size: height and width of the square input.
        channels: number of input channels.
        training: when true, each scale contributes its raw conv tensor
            followed by its decoded tensor; otherwise only the decoded tensor.
        CLASSES: class-name source; its length sets the number of classes.
        dot_name_file: forwarded to ``read_class_names``.

    Returns:
        tf.keras.Model mapping the input layer to the collected outputs.
    """
    class_count = len(read_class_names(CLASSES, dot_name_file=dot_name_file))
    inputs = Input([input_size, input_size, channels])

    outputs = []
    for scale_idx, raw in enumerate(YOLOv3(inputs, class_count)):
        decoded = decode(raw, class_count, scale_idx)
        outputs.extend([raw, decoded] if training else [decoded])

    return tf.keras.Model(inputs, outputs)
def main(_argv):
    """Run YOLO detection plus Deep SORT tracking over a video source.

    Reads these flags: ``FLAGS.size`` (square network input size),
    ``FLAGS.video`` (camera index or path), ``FLAGS.weights`` (saved-model
    directory), ``FLAGS.output`` / ``FLAGS.output_format`` (optional output
    video and its fourcc), ``FLAGS.iou`` / ``FLAGS.score`` (NMS thresholds)
    and ``FLAGS.show`` (display frames in a window).

    For every frame the detector output is reduced with combined NMS,
    restricted to the allowed classes, encoded into appearance features and
    fed to the tracker; confirmed tracks are drawn onto the frame. Processing
    stops at end of stream or when ``q`` is pressed. The capture, the writer
    and any windows are released even if processing raises.

    Args:
        _argv: unused positional arguments from the application runner.
    """
    # Definition of the parameters
    max_cosine_distance = 0.5
    nn_budget = None
    nms_max_overlap = 1

    # initialize deep sort
    model_filename = 'deep_sort/feature_extractor/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    # calculate cosine distance metric
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    # initialize tracker
    tracker = Tracker(metric)

    # configuration of object detector
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    # The session is created for its side effects only; the name is kept
    # bound so the object stays alive for the duration of the run.
    session = InteractiveSession(config=config)  # noqa: F841
    # Return value is not needed here; the call is retained in case it
    # validates FLAGS.
    utils.load_config(FLAGS)
    input_size = FLAGS.size
    video_path = FLAGS.video

    saved_model_loaded = tf.saved_model.load(
        FLAGS.weights, tags=[tag_constants.SERVING])
    infer = saved_model_loaded.signatures['serving_default']

    # A numeric source is a camera index; anything else is used as given.
    try:
        source = int(video_path)
    except (TypeError, ValueError):
        source = video_path
    vid = cv2.VideoCapture(source)

    out = None
    if FLAGS.output:
        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
        source_fps = int(vid.get(cv2.CAP_PROP_FPS))
        codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
        out = cv2.VideoWriter(FLAGS.output, codec, source_fps, (width, height))

    # Loop invariants: class table, allowed classes and the colour palette.
    class_names = utils.read_class_names(cfg.YOLO.CLASSES)
    # allowed_classes = list(class_names.values())
    allowed_classes = {'person', 'car'}
    cmap = plt.get_cmap('tab20b')
    colors = [cmap(i)[:3] for i in np.linspace(0, 1, 20)]

    frame_num = 0
    try:
        # looping over each frame of the video
        while True:
            return_value, frame = vid.read()
            if not return_value:
                print('End!')
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_num += 1
            print('Frame #: ', frame_num)

            image_data = cv2.resize(frame, (input_size, input_size))
            image_data = image_data / 255.
            image_data = image_data[np.newaxis, ...].astype(np.float32)
            start_time = time.time()

            # run the detections
            pred_bbox = infer(tf.constant(image_data))
            for value in pred_bbox.values():
                boxes = value[:, :, 0:4]
                pred_conf = value[:, :, 4:]

            boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
                boxes=tf.reshape(boxes, (tf.shape(boxes)[0], -1, 1, 4)),
                scores=tf.reshape(
                    pred_conf,
                    (tf.shape(pred_conf)[0], -1, tf.shape(pred_conf)[-1])),
                max_output_size_per_class=50,
                max_total_size=50,
                iou_threshold=FLAGS.iou,
                score_threshold=FLAGS.score)

            # convert data to numpy arrays and slice out unused elements
            num_objects = int(valid_detections.numpy()[0])
            bboxes = boxes.numpy()[0][:num_objects]
            scores = scores.numpy()[0][:num_objects]
            classes = classes.numpy()[0][:num_objects]

            # format bounding boxes from normalized ymin, xmin, ymax, xmax
            # ---> xmin, ymin, width, height
            original_h, original_w, _ = frame.shape
            bboxes = utils.format_boxes(bboxes, original_h, original_w)

            # Keep the names of allowed detections; remember the rest so the
            # matching rows can be dropped from boxes and scores.
            names = []
            deleted_indx = []
            for idx, class_id in enumerate(classes):
                class_name = class_names[int(class_id)]
                if class_name in allowed_classes:
                    names.append(class_name)
                else:
                    deleted_indx.append(idx)
            names = np.array(names)

            # remove detections that are not in allowed_classes
            bboxes = np.delete(bboxes, deleted_indx, axis=0)
            scores = np.delete(scores, deleted_indx, axis=0)

            # encode yolo detections and feed to tracker
            features = encoder(frame, bboxes)
            detections = [
                Detection(bbox, score, class_name, feature)
                for bbox, score, class_name, feature in zip(
                    bboxes, scores, names, features)
            ]

            # run non-maxima supression
            boxs = np.array([d.tlwh for d in detections])
            scores = np.array([d.confidence for d in detections])
            classes = np.array([d.class_name for d in detections])
            indices = preprocessing.non_max_suppression(
                boxs, classes, nms_max_overlap, scores)
            detections = [detections[i] for i in indices]

            # The tracker
            tracker.predict()
            tracker.update(detections)

            # update tracks
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue
                bbox = track.to_tlbr()
                class_name = track.get_class()

                # draw bbox on screen
                color = colors[int(track.track_id) % len(colors)]
                color = [i * 255 for i in color]
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), color, 2)
                cv2.rectangle(
                    frame, (int(bbox[0]), int(bbox[1] - 30)),
                    (int(bbox[0]) +
                     (len(class_name) + len(str(track.track_id))) * 17,
                     int(bbox[1])), color, -1)
                cv2.putText(frame, class_name + "-" + str(track.track_id),
                            (int(bbox[0]), int(bbox[1] - 10)), 0, 0.75,
                            (255, 255, 255), 2)

            # calculate frames per second of running detections
            fps = 1.0 / (time.time() - start_time)
            print("FPS: %.2f" % fps)
            result = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

            if FLAGS.show:
                cv2.imshow("Output Video", result)

            # if output flag is set, save video file
            if out is not None:
                out.write(result)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and writer so the output file is finalized,
        # even when processing raises.
        vid.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()
#trainset = Dataset('train', NEW_CLASSES_TO_LEARN, CLASSES_TO_EVALUATE) testset = Dataset('test', TEST_CLASSES, TEST_CLASSES) steps_per_epoch =0 for _ in testset: steps_per_epoch+=1 print(steps_per_epoch) yolo = Create_Yolov3(input_size=YOLO_INPUT_SIZE, training= False, CLASSES=CLASSES_TO_EVALUATE, dot_name_file = False) yolo.load_weights('./checkpoints/yolov3_custom_val_loss_ 11.08') num_classes = len(read_class_names(CLASSES_TO_EVALUATE)) CLASSES_NAME = read_class_names(CLASSES_TO_EVALUATE) def evaluate(y_pred, y_true_temp, num_classes, score_thresh=0.4, iou_thresh=0.5): y_true = [y_true_temp[0][0], y_true_temp[1][0], y_true_temp[2][0]] num_images = y_true[0].shape[0] true_labels_dict = {i:0 for i in range(num_classes)} # {class: count} pred_labels_dict = {i:0 for i in range(num_classes)} true_positive_dict = {i:0 for i in range(num_classes)} for i in range(num_images):
def preprocess_true_boxes(self, bboxes):
    """Encode ground-truth boxes into per-scale training targets.

    Only boxes whose class index is contained in ``self.new_classes`` are
    encoded; every other box is skipped. The class index is translated from
    its position in the Pascal VOC list to its position in the list named by
    ``self.TOTAL_CLASSES_WILL_KNOW_AFTER_THIS``.

    Args:
        bboxes: iterable of rows laid out as
            ``(xmin, ymin, xmax, ymax, class_index)``; the first four values
            are converted to centre/size form here and ``class_index`` is
            used to index the VOC class list.

    Returns:
        tuple: ``(label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes,
        lbboxes)``. Each label array has shape ``(train_output_sizes[i],
        train_output_sizes[i], anchor_per_scale, 5 + num_classes)`` and each
        box array has shape ``(max_bbox_per_scale, 4)``.
    """
    EXPANDED_CLASSES_NAME = read_class_names(
        self.TOTAL_CLASSES_WILL_KNOW_AFTER_THIS, dot_name_file=False)
    VOC_CLASSES_NAME = list(
        read_class_names(PASCAL_VOC_ALL_CLASSES, dot_name_file=False))

    label = [
        np.zeros((self.train_output_sizes[i], self.train_output_sizes[i],
                  self.anchor_per_scale, 5 + self.num_classes))
        for i in range(3)
    ]
    bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(3)]
    bbox_count = np.zeros((3, ))

    def write_target(scale, anchor_sel, xywh, xywh_scaled, class_probs):
        # Store one box in the grid cell containing its centre and append it
        # to that scale's box list, wrapping around when the list is full.
        xind, yind = np.floor(xywh_scaled[scale, 0:2]).astype(np.int32)
        cell = label[scale]
        cell[yind, xind, anchor_sel, :] = 0
        cell[yind, xind, anchor_sel, 0:4] = xywh
        cell[yind, xind, anchor_sel, 4:5] = 1.0
        cell[yind, xind, anchor_sel, 5:] = class_probs

        slot = int(bbox_count[scale] % self.max_bbox_per_scale)
        bboxes_xywh[scale][slot, :4] = xywh
        bbox_count[scale] += 1

    for bbox in bboxes:
        bbox_coor = bbox[:4]
        bbox_class_ind = bbox[4]
        if bbox_class_ind not in self.new_classes:
            continue

        # np.float64 replaces the np.float alias, which NumPy 1.24 removed.
        onehot = np.zeros(self.num_classes, dtype=np.float64)
        bbox_class_ind_new = EXPANDED_CLASSES_NAME.index(
            VOC_CLASSES_NAME[bbox_class_ind])
        onehot[bbox_class_ind_new] = 1.0

        # Label smoothing: blend the one-hot vector with a uniform one.
        uniform_distribution = np.full(self.num_classes,
                                       1.0 / self.num_classes)
        deta = 0.01
        smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

        # Corner form -> (centre x, centre y, width, height), then one copy
        # per scale expressed in grid units.
        bbox_xywh = np.concatenate(
            [(bbox_coor[2:] + bbox_coor[:2]) * 0.5,
             bbox_coor[2:] - bbox_coor[:2]], axis=-1)
        bbox_xywh_scaled = (1.0 * bbox_xywh[np.newaxis, :] /
                            self.strides[:, np.newaxis])

        iou = []
        exist_positive = False
        for i in range(3):
            # Anchors are centred on the grid cell that holds the box centre.
            anchors_xywh = np.zeros((self.anchor_per_scale, 4))
            anchors_xywh[:, 0:2] = np.floor(
                bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
            anchors_xywh[:, 2:4] = self.anchors[i]

            iou_scale = bbox_iou(bbox_xywh_scaled[i][np.newaxis, :],
                                 anchors_xywh)
            iou.append(iou_scale)
            iou_mask = iou_scale > 0.3

            if np.any(iou_mask):
                write_target(i, iou_mask, bbox_xywh, bbox_xywh_scaled,
                             smooth_onehot)
                exist_positive = True

        if not exist_positive:
            # No anchor passed the threshold: fall back to the single best
            # anchor over all scales.
            best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
            best_detect, best_anchor = divmod(int(best_anchor_ind),
                                              self.anchor_per_scale)
            write_target(best_detect, best_anchor, bbox_xywh,
                         bbox_xywh_scaled, smooth_onehot)

    label_sbbox, label_mbbox, label_lbbox = label
    sbboxes, mbboxes, lbboxes = bboxes_xywh
    return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
import numpy as np import tensorflow as tf from yolo.yolov3 import Create_Yolov3 from yolo.utils import image_preprocess, postprocess_boxes, nms, read_class_names from yolo.configs import * annot_path = './model_data/pascal_voc07_test.txt' CURRENT_KNOWN_OBJECTS = EVALUATION_CLASSES = './model_data/classes_to_evaluate.txt' PASCAL_VOC_ALL_CLASSES = './model_data/pascal_voc07_cls_names.txt' dt = './dt/' iou_threshold = 0.5 score_threshold = 0.3 input_size = YOLO_INPUT_SIZE CURRENT_KNOWN_OBJECTS_NAME = read_class_names(EVALUATION_CLASSES, dot_name_file=False) PASCAL_VOC_ALL_CLASSES_NAME = read_class_names(PASCAL_VOC_ALL_CLASSES) yolo = Create_Yolov3(input_size=input_size, CLASSES=CURRENT_KNOWN_OBJECTS) yolo.load_weights('./checkpoints/yolov3_custom_val_loss_ 808.03') def load_annotations(annot_path): final_annotations = [] with open(annot_path, 'r') as f: txt = f.readlines() annotations = [ line.strip() for line in txt if len(line.strip().split()[1:]) != 0 ] np.random.shuffle(annotations)
def compute_loss(pred, conv, label, bboxes, i=0, CLASSES='', PRED_PREV='', CLASSES_PREV=''):
    """Compute detection losses plus a distillation loss against a previous model.

    Args:
        pred: decoded predictions; channels 0:4 are read as xywh and 4:5 as
            confidence.
        conv: raw network output for this scale; reshaped here to
            ``(batch, size, size, 3, 5 + NUM_CLASS)``.
        label: target tensor; channels 0:4 xywh, 4:5 objectness, 5: class
            probabilities.
        bboxes: ground-truth boxes used for the background IoU test
            (indexed as ``[batch, box, coord]``).
        i: index into ``STRIDES`` for this scale.
        CLASSES: class-name source for the current model.
        PRED_PREV: predictions of the previous model for the same input; its
            last axis is padded here with the constant 0.001 so that it
            matches the current number of classes.
        CLASSES_PREV: class-name source for the previous model.

    Returns:
        tuple: ``(giou_loss, conf_loss, prob_loss, distilation_loss)``, each
        summed over axes 1-4 and averaged over the batch.
    """
    num_classes = len(read_class_names(CLASSES, dot_name_file=False))
    num_classes_prev = len(read_class_names(CLASSES_PREV, dot_name_file=False))

    raw_shape = tf.shape(conv)
    n_batch, grid = raw_shape[0], raw_shape[1]
    input_size = tf.cast(STRIDES[i] * grid, tf.float32)

    conv = tf.reshape(conv, (n_batch, grid, grid, 3, 5 + num_classes))

    # Pad the old predictions with near-zero scores for the added classes.
    padding = tf.zeros(
        (n_batch, grid, grid, 3, (num_classes - num_classes_prev)),
        tf.float32) + 0.001
    prev_pred = tf.concat([PRED_PREV, padding], axis=-1)

    prev_confidence = prev_pred[..., 4:5]
    # Currently unused: it only feeds the disabled second criterion
    # (present_confidence > 0.75).
    present_confidence = tf.sigmoid(conv[..., 4:5])
    # Cells where the previous model reported a confidence above 0.007.
    prev_respond_bgd = tf.cast(prev_confidence > 0.007, tf.float32)

    raw_conf = conv[..., 4:5]
    raw_prob = conv[..., 5:]

    pred_xywh = pred[..., 0:4]
    pred_conf = pred[..., 4:5]

    label_xywh = label[..., 0:4]
    respond_bbox = label[..., 4:5]  # objectness
    label_prob = label[..., 5:]

    # Box regression: GIoU weighted by a factor that shrinks with box area.
    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    box_area = 1.0 * label_xywh[..., 2:3] * label_xywh[..., 3:4]
    bbox_loss_scale = 2.0 - box_area / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    # Best IoU of every predicted box against all ground-truth boxes.
    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :],
                   bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # A cell without an object whose best IoU is below the threshold is
    # treated as background.
    respond_bgd = (1.0 - respond_bbox) * tf.cast(
        max_iou < YOLO_IOU_LOSS_THRESH, tf.float32)

    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    # Confidence loss: background cells flagged by the previous model are
    # excluded here and handled by the distillation terms instead.
    conf_bce = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=respond_bbox, logits=raw_conf)
    conf_loss = conf_focal * (
        respond_bbox * conf_bce +
        respond_bgd * (1 - prev_respond_bgd) * conf_bce)

    # Distillation applies on background cells the previous model flagged.
    distill_mask = respond_bgd * prev_respond_bgd
    distilation_loss_reg = tf.math.abs(
        distill_mask * tf.subtract(prev_pred[..., 0:4], pred[..., 0:4]))
    distilation_loss_conf = distill_mask * tf.nn.sigmoid_cross_entropy_with_logits(
        labels=prev_pred[..., 4:5], logits=conv[..., 4:5])
    distilation_loss_prob = distill_mask * tf.nn.sigmoid_cross_entropy_with_logits(
        labels=prev_pred[..., 5:], logits=conv[..., 5:])

    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(
        labels=label_prob, logits=raw_prob)

    def _batch_mean(per_cell):
        # Sum over grid/anchor/channel axes, then average over the batch.
        return tf.reduce_mean(tf.reduce_sum(per_cell, axis=[1, 2, 3, 4]))

    giou_loss = _batch_mean(giou_loss)
    conf_loss = _batch_mean(conf_loss)
    prob_loss = _batch_mean(prob_loss)

    distilation_loss_conf = _batch_mean(distilation_loss_conf)
    distilation_loss_prob = _batch_mean(distilation_loss_prob)
    distilation_loss_reg = _batch_mean(distilation_loss_reg)

    # The regression term is down-weighted relative to the other two.
    distilation_loss = (distilation_loss_prob + distilation_loss_reg * 0.02 +
                        distilation_loss_conf)

    return giou_loss, conf_loss, prob_loss, distilation_loss