def train_hs512(lr=1e-4, freeze_bn=False, optim=None, batch_size=8, weights_path=None, save_weights_only=True, epochs=25):
    if weights_path is None:
        weights_path = 'VGG_VOC0712Plus_SSD_512x512_ft_iter_160000.h5'

    img_height = 512  # 1088 // 2 # Height of the input images
    img_width = 512  # 2048 // 2 # Width of the input images
    img_channels = 3  # Number of color channels of the input images
    # subtract_mean = [104, 117, 123] # The per-channel mean of the images in the dataset
    subtract_mean = [138, 138, 138]  # The per-channel mean of the images in the dataset
    swap_channels = False
    n_classes = 20  # The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
    scales = [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9, 1.06]  # MS COCO scales
    # scales = [0.07, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9, 1.05]
    aspect_ratios = [[1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                     [1.0, 2.0, 0.5],
                     [1.0, 2.0, 0.5]]  # The anchor box aspect ratios used in the original SSD512; the order matters
    two_boxes_for_ar1 = True
    # The space between two adjacent anchor box center points for each predictor layer.
    steps = [8, 16, 32, 64, 128, 256, 512]
    offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    limit_boxes = False  # Whether or not to limit the anchor boxes to lie entirely within the image boundaries
    variances = [0.1, 0.1, 0.2, 0.2]
    coords = 'centroids'
    normalize_coords = True

    # 1: Build the Keras model
    K.clear_session()
    model, pred_sizes = ssd_512(image_size=(img_height, img_width, img_channels),
                                n_classes=n_classes,
                                l2_regularization=0.0005,
                                scales=scales,
                                aspect_ratios_per_layer=aspect_ratios,
                                two_boxes_for_ar1=two_boxes_for_ar1,
                                steps=steps,
                                offsets=offsets,
                                limit_boxes=limit_boxes,
                                variances=variances,
                                coords=coords,
                                normalize_coords=normalize_coords,
                                subtract_mean=None,
                                divide_by_stddev=None,
                                swap_channels=swap_channels,
                                return_predictor_sizes=True)

    # 2: Load the trained weights into the model.
    model.load_weights(weights_path, by_name=True)

    # 3: Instantiate the optimizer and the SSD loss function and compile the model
    if optim is None:
        optim = SGD(lr=lr, momentum=0.9)
    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

    # 4: Freeze the base network if requested.
    if freeze_bn:
        for l in model.layers[:38]:
            l.trainable = False

    # 5: Compile the model.
    model.compile(optimizer=optim, loss=ssd_loss.compute_loss)

    ## Prepare data generation

    # 1: Instantiate two `BatchGenerator` objects: one for training, one for validation.
    train_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    val_dataset = BatchGenerator(
        box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    # 2: Parse the image and label lists for the training and validation datasets. This can take a while.
    images_path_root = './Datasets/'
    train_combined_labels = './Datasets/train_combined_ssd_512.txt'
    val_labels = './Datasets/val_ssd_512.txt'

    train_dataset.parse_csv(images_dir=images_path_root,
                            labels_filename=train_combined_labels,
                            input_format=['image_name', 'class_id', 'xmin', 'xmax', 'ymin', 'ymax'])
    val_dataset.parse_csv(images_dir=images_path_root,
                          labels_filename=val_labels,
                          input_format=['image_name', 'class_id', 'xmin', 'xmax', 'ymin', 'ymax'])

    # 3: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.
    ssd_box_encoder = SSDBoxEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=pred_sizes,
                                    min_scale=None,
                                    max_scale=None,
                                    scales=scales,
                                    aspect_ratios_global=None,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    limit_boxes=limit_boxes,
                                    variances=variances,
                                    pos_iou_threshold=0.5,
                                    neg_iou_threshold=0.2,
                                    coords=coords,
                                    normalize_coords=normalize_coords)

    # 4: Set the image processing / data augmentation options and create generator handles.
    train_generator = train_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),  # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),  # This one is important because the Pascal VOC images vary in size
        random_crop=False,
        crop=False,
        resize=False,
        gray=True,
        limit_boxes=True,  # While the anchor boxes are not being clipped, the ground truth boxes should be
        include_thresh=0.4)

    val_generator = val_dataset.generate(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        equalize=False,
        brightness=False,
        flip=False,
        translate=False,
        scale=False,
        max_crop_and_resize=(img_height, img_width, 1, 3),
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        random_crop=False,
        crop=False,
        resize=False,
        gray=True,
        limit_boxes=True,
        include_thresh=0.4)

    # Get the number of samples in the training and validation datasets to compute the epoch lengths below.
    n_train_samples = train_dataset.get_n_samples()
    n_val_samples = val_dataset.get_n_samples()

    ## 4. Run the training
    fingerprint = 'ssd{:%Y-%m-%d_%H-%M-%S}'.format(datetime.datetime.now())
    tbCallBack = TensorBoard(log_dir='./Graph/{}'.format(fingerprint),
                             histogram_freq=0,
                             batch_size=batch_size,
                             write_graph=True)
    checkpointer = ModelCheckpoint(
        './saved/{{val_loss:.2f}}__{}_best_weights.h5'.format(fingerprint),
        monitor='val_loss',
        verbose=1,
        save_best_only=save_weights_only,
        save_weights_only=True,
        mode='auto',
        period=1)
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                                patience=5,
                                                verbose=1,
                                                factor=0.5,
                                                min_lr=1e-6)
    stopper = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10)

    history = model.fit_generator(
        generator=train_generator,
        steps_per_epoch=ceil(n_train_samples / batch_size),
        epochs=epochs,
        callbacks=[checkpointer, learning_rate_reduction, stopper, tbCallBack],
        validation_data=val_generator,
        validation_steps=ceil(n_val_samples / batch_size))
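# Minimal usage sketch for the function above. The hyperparameter values are
# illustrative defaults, not values from any reported run, and the weights file
# is assumed to sit in the working directory as in the default argument.
train_hs512(lr=1e-4, freeze_bn=True, batch_size=8, epochs=25)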
dense_layer_size = 512

# Normalization parameters.
mean = 0.408808
std = 0.237583
t_mean = -0.041212
t_std = 0.323931
p_mean = -0.000276
p_std = 0.540958

# Models.
head_detector = ssd_512(image_size=(in_size_detector, in_size_detector, 3),
                        n_classes=1,
                        min_scale=0.1,
                        max_scale=1,
                        mode='inference')
head_detector.load_weights(detector_path)

pose_estimator = mpatacchiola_generic(in_size_estimator, num_conv_blocks,
                                      num_filters_start, num_dense_layers,
                                      dense_layer_size)
pose_estimator.load_weights(estimator_path)

# Get the video source.
video_source = input("Input stream: ")

# If the video source is a device, convert the string identifying it to an integer.
try:
    video_source = int(video_source)
except ValueError:
    pass
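# Hedged sketch of how the constants above would typically be used: the pose
# estimator is assumed to output standardized (tilt, pan) values, which are
# mapped back to angles with t_mean/t_std and p_mean/p_std. The helper name
# and the (tilt, pan) output convention are illustrative, not taken from the
# original code.
def denormalize_pose(tilt_norm, pan_norm):
    tilt = tilt_norm * t_std + t_mean
    pan = pan_norm * p_std + p_mean
    return tilt, pan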
def main():
    '''
    This function acts as a testbench for the function clean_pointing04, using it
    to perform the basic processing of the Pointing'04 dataset from a set of
    default values defined below.
    '''
    # Source path.
    pointing04_dir = 'original/HeadPoseImageDatabase/'
    # Destination path.
    destination_dir = 'clean/pointing04/'
    # Detector model path.
    head_detector_path = 'models/head-detector.h5'

    # Detection parameters.
    in_size = 512
    out_size = 64
    confidence_threshold = 0.75

    # Output parameters.
    grayscale_output = True
    downscaling_interpolation = cv2.INTER_LINEAR

    # Number of splits for class assignment.
    num_splits_tilt = 8
    num_splits_pan = 8

    # Ratios for the train/test and train/validation splits.
    test_ratio = 0.2
    validation_ratio = 0.2

    # Detector model.
    detector = ssd_512(image_size=(in_size, in_size, 3),
                       n_classes=1,
                       min_scale=0.1,
                       max_scale=1,
                       mode='inference')
    detector.load_weights(head_detector_path)

    # Check if the output directory exists; recreate it if it does.
    try:
        os.mkdir(destination_dir)
        print("Directory", destination_dir, "created.")
    except FileExistsError:
        print("Directory", destination_dir, "already exists.")
        shutil.rmtree(destination_dir)
        os.mkdir(destination_dir)

    # Actual cleaning.
    clean_pointing04(pointing04_dir, destination_dir, detector,
                     confidence_threshold, out_size, grayscale_output,
                     downscaling_interpolation)

    # Assign classes.
    class_assign(destination_dir, num_splits_tilt, num_splits_pan)

    # Split the dataset.
    split_dataset(destination_dir, test_ratio, validation_ratio)

    # Get the normalization parameters.
    find_norm_parameters(destination_dir)

    # OPTIONAL: Save the dataset as numpy arrays (for uploading to Google Colab).
    store_dataset_arrays(destination_dir)
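# Entry-point guard, assuming this module is meant to be run directly as a script.
if __name__ == '__main__':
    main()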
clip_boxes = False  # Whether or not to clip the anchor boxes to lie entirely within the image boundaries
variances = [0.1, 0.1, 0.2, 0.2]  # The variances by which the encoded target coordinates are divided as in the original implementation
normalize_coords = True

# 1: Build the Keras model
K.clear_session()  # Clear previous models from memory.

model = ssd_512(image_size=(img_height, img_width, img_channels),
                n_classes=n_classes,
                mode='training',
                l2_regularization=0.0005,
                scales=scales,
                aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=True,
                steps=steps,
                offsets=offsets,
                clip_boxes=clip_boxes,
                variances=variances,
                normalize_coords=normalize_coords,
                subtract_mean=mean_color,
                swap_channels=swap_channels)

# TODO: scales?
# TODO: aspect_ratios_per_layer?
# TODO: two_boxes_for_ar1?
# TODO: steps?
# TODO: offsets?
# TODO: clip_boxes?
# TODO: variances?
# TODO: normalize_coords?
# TODO: subtract_mean?
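# A plausible fill-in for the TODOs above, taken from the SSD512 configuration
# used by the inference script elsewhere in this repo (Pascal VOC scales, seven
# predictor layers). These values are illustrative only; they are not
# necessarily the ones this training script was finally run with.
img_height, img_width, img_channels = 512, 512, 3
n_classes = 20
scales = [0.07, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9, 1.05]
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]]
steps = [8, 16, 32, 64, 128, 256, 512]
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
mean_color = [123, 117, 104]
swap_channels = [2, 1, 0]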
def main():
    rospy.init_node('new_detect_pkg', anonymous=True)
    ic = image_converter()
    icp = image_converter_pointcloud()
    r = rospy.Rate(50)
    rospy.sleep(0.5)
    publisher()
    print('---------initialization model...please wait----------')

    # ssd_entity = SSD_entity()
    img_height = 512  # Height of the input images
    img_width = 512  # Width of the input images
    img_channels = 3  # Number of color channels of the input images

    # 1: Build the Keras model
    K.clear_session()  # Clear previous models from memory.
    model = ssd_512(
        image_size=(img_height, img_width, 3),
        n_classes=20,
        mode='inference',
        l2_regularization=0.0005,
        scales=[0.07, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9, 1.05],  # The scales for MS COCO are [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9, 1.06]
        aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                 [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                 [1.0, 2.0, 0.5],
                                 [1.0, 2.0, 0.5]],
        two_boxes_for_ar1=True,
        steps=[8, 16, 32, 64, 128, 256, 512],
        offsets=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
        clip_boxes=False,
        variances=[0.1, 0.1, 0.2, 0.2],
        normalize_coords=True,
        subtract_mean=[123, 117, 104],
        swap_channels=[2, 1, 0],
        confidence_thresh=0.5,
        iou_threshold=0.45,
        top_k=200,
        nms_max_output_size=400)
    print("Model built.")

    # 2: Load the trained weights into the model.
    # model.load_weights('/home/ogai1234/catkin_ws/src/detect_pkg/bin/ssd300_weights_epoch-45_loss-4.3010_val_loss-4.3788.h5', by_name=True)
    # model.load_weights('/home/ogai1234/catkin_ws/src/detect_pkg/bin/VGG_VOC0712Plus_SSD_300x300_ft_iter_160000.h5', by_name=True)
    model.load_weights(
        '/home/ogai1234/catkin_ws/src/detect_pkg/bin/VGG_VOC0712Plus_SSD_512x512_ft_iter_160000.h5',
        by_name=True)
    print('---------model done----------')

    # classes = ["background", "dog", "umbrellaman", "cone", "car", "bicycle", "person"]
    classes = [
        'background', 'Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle', 'Bus',
        'Car', 'Cat', 'Chair', 'Cow', 'Diningtable', 'Dog', 'Horse',
        'Motorbike', 'Person', 'Pottedplant', 'Sheep', 'Sofa', 'Train',
        'Tvmonitor'
    ]

    # Map the Pascal VOC class ids that get published on ROS to their type codes:
    # Person -> 1, Car -> 2, Bus -> 2, Train -> 4. All other classes are ignored.
    type_codes = {15: 1, 7: 2, 6: 2, 19: 4}

    key = ''
    t0 = time.time()
    frame_code = 1
    linewidth = 3
    figsize = (10, 10)

    # Initialize the inter-frame filter cache.
    global record_xmin, record_xmax, record_ymin, record_ymax, record_cls_id, record_obj_id
    global record_obj_num_appeared, total_objPerFrame
    record_obj_num_appeared = 0
    record_xmin = [[0 for i in range(11)] for i in range(2)]
    record_xmax = [[0 for i in range(11)] for i in range(2)]
    record_ymin = [[0 for i in range(11)] for i in range(2)]
    record_ymax = [[0 for i in range(11)] for i in range(2)]
    record_cls_id = [[0 for i in range(11)] for i in range(2)]
    record_obj_id = [[0 for i in range(11)] for i in range(2)]
    global objPerFrame
    objPerFrame = 0
    total_objPerFrame = 0
    font = cv2.FONT_HERSHEY_TRIPLEX

    while (key != 113) and (not rospy.is_shutdown()):
        t1 = time.time()  # one frame starts
        frame_code = frame_code + 1
        print("_________________________Frame:", frame_code)

        # State published on the ROS topics.
        global objPerlastFrame, type_code_list, confidence_list, distance_list, x_list, y_list, z_list
        objPerlastFrame = total_objPerFrame  # number of objects recorded in the previous frame
        objPerFrame = 0
        total_objPerFrame = 0
        # Reset the ROS message content for this frame.
        type_code_list = [0 for i in range(11)]
        confidence_list = [0 for i in range(11)]
        distance_list = [0 for i in range(11)]
        x_list = [0 for i in range(11)]
        y_list = [0 for i in range(11)]
        z_list = [0 for i in range(11)]

        # Load the ZED image from the ROS topic.
        image = ic.zed_image  # image.shape = (720, 1280, 3)
        frame = image
        frame_old = frame
        # frame = frame[:, :, 0:3]
        frame = cv2.resize(frame, (img_width, img_height))
        frame_np = np.array(frame)
        frame_np = frame_np[np.newaxis, :, :, :]

        # Run the frame through the SSD network and keep detections above the confidence threshold.
        y_pred = model.predict(frame_np)
        confidence_threshold = 0.5
        y_pred_thresh = [
            y_pred[k][y_pred[k, :, 1] > confidence_threshold]
            for k in range(y_pred.shape[0])
        ]

        colors = dict()
        for box in y_pred_thresh[0]:
            if objPerFrame > 9:
                break
            cls_id = int(box[0])
            if cls_id not in colors:
                colors[cls_id] = (random.random(), random.random(), random.random())
            # Only persons, cars, buses and trains are collected and published on ROS.
            # To publish every detected class, drop this check and choose a default type code.
            if cls_id not in type_codes:
                continue

            score = box[1]
            # Rescale the box from the 512x512 network input back to the original frame.
            xmin = int(box[2] * frame_old.shape[1] / img_width)
            ymin = int(box[3] * frame_old.shape[0] / img_height)
            xmax = int(box[4] * frame_old.shape[1] / img_width)
            ymax = int(box[5] * frame_old.shape[0] / img_height)
            cv2.rectangle(frame_old, (xmin, ymin), (xmax, ymax),
                          color=colors[cls_id], thickness=linewidth)

            # Look up the 3D position of the box center in the ZED point cloud.
            x_center = round((xmin + xmax) / 2)
            y_center = round((ymin + ymax) / 2)
            icp.dict_1 = x_center
            icp.dict_2 = y_center
            rospy.sleep(0.04)
            point_cloud = icp.zed_image_pointcloud
            x, y, z = 0, 0, 0  # position obtained from the PointCloud2 message
            for p in point_cloud:
                x, y, z = p
                break
            distance = math.sqrt(x * x + y * y + z * z)

            class_name = classes[cls_id]
            type_code = type_codes[cls_id]
            total_objPerFrame = total_objPerFrame + 1
            appeared = frameFilter(cls_id, xmin, xmax, ymin, ymax)
            if appeared == 1:
                objRecord2Ros(class_name, score, cls_id, type_code, distance, x, y, z)
                # Label format: class | confidence (persons only) | distance | object id
                if cls_id == 15:
                    label = '{:s} | {:.2f} | {:.2f} |{:}'.format(
                        class_name, score, distance,
                        record_obj_id[1][total_objPerFrame])
                else:
                    label = '{:s} | {:.2f} |{:}'.format(
                        class_name, distance,
                        record_obj_id[1][total_objPerFrame])
                cv2.putText(frame_old, label, (xmin, ymin + 2), font, 0.5,
                            (255, 255, 255), thickness=1)

        # Show the image with the bounding boxes.
        # image_back = cv2.resize(frame, (1080, 720))
        cv2.imshow("image_back", frame_old)
        key = cv2.waitKey(1)
        t21 = time.time()
        # Calculate and print the fps.
        print('fps {:f}'.format(1 / (t21 - t1)))
        talker()

        # Move last frame's info one slot forward in the filter cache.
        for i in range(10):
            record_xmin[0][i + 1] = record_xmin[1][i + 1]
            record_xmax[0][i + 1] = record_xmax[1][i + 1]
            record_ymin[0][i + 1] = record_ymin[1][i + 1]
            record_ymax[0][i + 1] = record_ymax[1][i + 1]
            record_cls_id[0][i + 1] = record_cls_id[1][i + 1]
            record_obj_id[0][i + 1] = record_obj_id[1][i + 1]
            record_xmin[1][i + 1] = 0
            record_xmax[1][i + 1] = 0
            record_ymin[1][i + 1] = 0
            record_ymax[1][i + 1] = 0
            record_cls_id[1][i + 1] = 0
            record_obj_id[1][i + 1] = 0
        if record_obj_num_appeared > 10000:
            record_obj_num_appeared = 0
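# Entry-point guard, assuming the node is launched directly; catching
# rospy.ROSInterruptException is the conventional way to exit cleanly when the
# node is shut down while sleeping.
if __name__ == '__main__':
    try:
        main()
    except rospy.ROSInterruptException:
        pass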
model_dir = 'models/'
model_name = 'head-detector'
matconvnet_model_path = model_dir + model_name + '.mat'
keras_model_path = model_dir + model_name + '.h5'
layer_mapping_path = model_dir + 'layers.csv'

# Model parameters.
img_height = 512
img_width = 512

# Model instance.
model = ssd_512(image_size=(img_height, img_width, 3),
                n_classes=1,
                min_scale=0.1,
                max_scale=1,
                mode='inference')

# Load weights.
model_ori = loadmat(matconvnet_model_path)

# Load the layer mapping between the MatConvNet and Keras models.
layers = pd.read_csv(layer_mapping_path)

# For each entry in the mapping, set its weights and biases from the original model:
for index, layer in layers.iterrows():
    print("Assigning layer " + layer['name'] + "...")
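# Hedged sketch of the per-layer assignment step inside the loop above. The
# layout of the loadmat() result and the kernel/bias keys are assumptions
# (kernel_key/bias_key are hypothetical names, not columns known to exist in
# layers.csv); only get_layer().set_weights() is standard Keras API. MatConvNet
# stores convolution kernels as (H, W, in, out), which is also the layout Keras
# Conv2D expects, so the kernel can usually be passed through unchanged.
def assign_layer(keras_model, mat_model, keras_name, kernel_key, bias_key):
    kernel = mat_model[kernel_key]          # (H, W, in, out) kernel from the .mat file
    bias = mat_model[bias_key].reshape(-1)  # loadmat returns biases as 2-D arrays
    keras_model.get_layer(keras_name).set_weights([kernel, bias])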