def feature_flow():
    bbox_util = BBoxUtility(NUM_CLASSES)
    raw_inputs, images = load_inputs(image_files)
    inputs = preprocess_input(np.array(raw_inputs))

    dump_activation_layer = 'conv4_2'
    compare_layer_name = 'conv6_2'
    print('dump_activation_layer', dump_activation_layer)
    print('target_layer_name', compare_layer_name)

    # Normal SSD network
    model1 = SSD300v2(input_shape, num_classes=NUM_CLASSES)
    model1.load_weights('weights_SSD300.hdf5', by_name=True)
    predictions = run_network(model1, inputs)
    results = bbox_util.detection_out(predictions)
    plot_detections(images, results)

    # Get the dump layer's output (as input for the flow network)
    input_img2 = inputs[1:2, :, :, :]
    layer_dump = get_layer_output(model=model1, inputs=input_img2,
                                  output_layer_name=dump_activation_layer)
    print('layer_dump.shape = ', layer_dump.shape)

    # Flow (raw RGB)
    flow_rgb = compute_flow(image_files[1], image_files[0])
    print('flow.shape', flow_rgb.shape)
    imshow_fig(cv2.cvtColor(draw_hsv(flow_rgb), cv2.COLOR_BGR2RGB),
               title='flow_rgb')

    # Flow (resized for the feature map)
    flow_feature = get_flow_for_filter(flow_rgb)
    # imshow_fig(flow_feature[:, :, 0], title='flow_feature_y', cmap='gray')
    # imshow_fig(flow_feature[:, :, 1], title='flow_feature_x', cmap='gray')

    # Warp the image by flow_rgb
    iimg1 = cv2.imread(image_files[0])
    img_warp = warp_flow(iimg1, flow_rgb)
    imshow_fig(cv2.cvtColor(img_warp, cv2.COLOR_BGR2RGB),
               title='frame_2_warp')

    # Shift the feature map by the flow
    shifted_feature = shift_filter(layer_dump, flow_feature)

    # Flow network
    model2 = SSD300_conv4_3((128, 128, 512), num_classes=NUM_CLASSES)
    model2.load_weights('weights_SSD300.hdf5', by_name=True)
    predictions = run_network(model2, shifted_feature)
    results = bbox_util.detection_out(predictions)
    plot_detections(images[1:2], results)

    # Get a specific layer's output and compare them (for debugging)
    compare_model_layer(model1, input_img2, compare_layer_name,
                        model2, shifted_feature, compare_layer_name, True)

    sess.close()
    plt.show()

def main(img_paths):
    """Detect objects in images.

    Parameters
    ----------
    img_paths : list of strings
    """
    # Load the model
    voc_classes = ['Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle',
                   'Bus', 'Car', 'Cat', 'Chair', 'Cow', 'Diningtable',
                   'Dog', 'Horse', 'Motorbike', 'Person', 'Pottedplant',
                   'Sheep', 'Sofa', 'Train', 'Tvmonitor']
    NUM_CLASSES = len(voc_classes) + 1
    input_shape = (300, 300, 3)
    model = SSD300(input_shape, num_classes=NUM_CLASSES)
    model.load_weights('weights_SSD300.hdf5', by_name=True)
    bbox_util = BBoxUtility(NUM_CLASSES)

    # Load the inputs
    inputs = []
    images = []
    for img_path in img_paths:
        img = image.load_img(img_path, target_size=(300, 300))
        img = image.img_to_array(img)
        images.append(imread(img_path))
        inputs.append(img.copy())
    inputs = preprocess_input(np.array(inputs))

    # Predict
    preds = model.predict(inputs, batch_size=1, verbose=1)
    results = bbox_util.detection_out(preds)

    # Visualize
    for i, img in enumerate(images):
        create_overlay(img, results[i], voc_classes,
                       "{}-det.png".format(img_paths[i]))

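# Usage sketch for main() above, assuming 'weights_SSD300.hdf5' sits in the
# working directory; the image paths are hypothetical placeholders.
if __name__ == '__main__':
    main(['examples/cat.jpg', 'examples/street.jpg'])
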
class VideoTest(object):
    """ Class for testing a trained SSD model on a video file and showing
    the result in a window. The class is designed so that one VideoTest
    object can be created for a model, and the same object can then be
    used on multiple videos and webcams.

    Arguments:
        class_names: A list of strings, each containing the name of a class.
                     The first name should be that of the background class,
                     which is not used.

        model:       An SSD model. It should already be trained for images
                     similar to the video to test on.

        input_shape: The shape that the model expects for its input,
                     as a tuple, for example (300, 300, 3)

        bbox_util:   An instance of the BBoxUtility class in ssd_utils.py.
                     The BBoxUtility needs to be instantiated with the same
                     number of classes as the length of class_names.
    """

    def __init__(self, class_names, model, input_shape):
        self.class_names = class_names
        self.num_classes = len(class_names)
        self.model = model
        self.input_shape = input_shape
        self.bbox_util = BBoxUtility(self.num_classes)

        # Create unique and somewhat visually distinguishable bright
        # colors for the different classes.
        self.class_colors = []
        for i in range(0, self.num_classes):
            # This can probably be written in a more elegant manner
            hue = 255 * i / self.num_classes
            col = np.zeros((1, 1, 3)).astype("uint8")
            col[0][0][0] = hue
            col[0][0][1] = 128  # Saturation
            col[0][0][2] = 255  # Value
            cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
            col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]),
                   int(cvcol[0][0][2]))
            self.class_colors.append(col)

    def run(self, video_path=0, start_frame=0, conf_thresh=0.6):
        """ Runs the test on a video (or webcam)

        # Arguments
        video_path: A file path to a video to be tested on. Can also be a
                    number, in which case the webcam with the same number
                    (i.e. 0) is used instead

        start_frame: The number of the first frame of the video to be
                     processed by the network.

        conf_thresh: Threshold of confidence. Any boxes with lower
                     confidence are not visualized.
        """
        vid = cv2.VideoCapture(video_path)
        if not vid.isOpened():
            raise IOError(("Couldn't open video file or webcam. If you're "
                           "trying to open a webcam, make sure your "
                           "video_path is an integer!"))

        # Compute aspect ratio of video
        vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
        vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
        vidar = vidw / vidh

        # Skip frames until reaching start_frame (a frame index, per the
        # docstring, so seek by frame position rather than milliseconds)
        if start_frame > 0:
            vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        accum_time = 0
        curr_fps = 0
        fps = "FPS: ??"
        prev_time = timer()

        num_frame = 0
        while True:
            retval, orig_image = vid.read()
            if not retval:
                print("Done!")
                return

            im_size = (self.input_shape[0], self.input_shape[1])
            resized = cv2.resize(orig_image, im_size)
            rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

            # Reshape to original aspect ratio for later visualization
            # The resized version is used, to visualize what kind of
            # resolution the network has to work with.
            to_draw = cv2.resize(resized,
                                 (int(self.input_shape[0] * vidar),
                                  self.input_shape[1]))

            # Use model to predict
            inputs = [image.img_to_array(rgb)]
            tmp_inp = np.array(inputs)
            x = preprocess_input(tmp_inp)
            y = self.model.predict(x)

            # This line creates a new TensorFlow device every time.
            # Is there a way to avoid that?
            results = self.bbox_util.detection_out(y)

            if len(results) > 0 and len(results[0]) > 0:
                # Interpret output, only one frame is used
                det_label = results[0][:, 0]
                det_conf = results[0][:, 1]
                det_xmin = results[0][:, 2]
                det_ymin = results[0][:, 3]
                det_xmax = results[0][:, 4]
                det_ymax = results[0][:, 5]

                top_indices = [i for i, conf in enumerate(det_conf)
                               if conf >= conf_thresh]

                top_conf = det_conf[top_indices]
                top_label_indices = det_label[top_indices].tolist()
                top_xmin = det_xmin[top_indices]
                top_ymin = det_ymin[top_indices]
                top_xmax = det_xmax[top_indices]
                top_ymax = det_ymax[top_indices]

                for i in range(top_conf.shape[0]):
                    xmin = int(round(top_xmin[i] * to_draw.shape[1]))
                    ymin = int(round(top_ymin[i] * to_draw.shape[0]))
                    xmax = int(round(top_xmax[i] * to_draw.shape[1]))
                    ymax = int(round(top_ymax[i] * to_draw.shape[0]))

                    # Draw the box on top of the to_draw image
                    class_num = int(top_label_indices[i])
                    cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax),
                                  self.class_colors[class_num], 2)
                    text = (self.class_names[class_num] + " "
                            + ('%.2f' % top_conf[i]))

                    text_top = (xmin, ymin - 10)
                    text_bot = (xmin + 80, ymin + 5)
                    text_pos = (xmin + 5, ymin)
                    cv2.rectangle(to_draw, text_top, text_bot,
                                  self.class_colors[class_num], -1)
                    cv2.putText(to_draw, text, text_pos,
                                cv2.FONT_HERSHEY_SIMPLEX, 0.35,
                                (0, 0, 0), 1)
                    # print(text)

            # Calculate FPS
            # This computes FPS for everything, not just the model's
            # execution, which may or may not be what you want
            curr_time = timer()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1
            if accum_time > 1:
                accum_time = accum_time - 1
                fps = "FPS: " + str(curr_fps)
                curr_fps = 0

            # Draw FPS in top left corner
            cv2.rectangle(to_draw, (0, 0), (50, 17), (255, 255, 255), -1)
            cv2.putText(to_draw, fps, (3, 10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.35, (0, 0, 0), 1)

            cv2.imshow("SSD result", to_draw)
            cv2.waitKey(10)

            # Save the annotated frame
            cv2.imwrite("./video/pic/frame_{0:04d}.png".format(num_frame),
                        to_draw)
            num_frame += 1

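# Usage sketch for VideoTest, assuming an SSD300 model and the PASCAL VOC
# class list as used elsewhere in this file; runs on webcam 0.
if __name__ == '__main__':
    input_shape = (300, 300, 3)
    class_names = ["background", "aeroplane", "bicycle", "bird", "boat",
                   "bottle", "bus", "car", "cat", "chair", "cow",
                   "diningtable", "dog", "horse", "motorbike", "person",
                   "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
    model = SSD300(input_shape, num_classes=len(class_names))
    model.load_weights('weights_SSD300.hdf5', by_name=True)
    VideoTest(class_names, model, input_shape).run(0)
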
def dtc_predict_py_edit(predict_dir,
                        predicted_dir,
                        dict,
                        model_path,
                        conf_threshold=0.6,
                        is_conf_threshold_down=False,
                        class_model=None,
                        dict_class={0.0: "Car", 1.0: "Bicycle",
                                    2.0: "Pedestrian", 3.0: "Signal",
                                    4.0: "Signs", 5.0: "Truck"},
                        img_height=331,
                        img_width=331,
                        is_overwrite=False,
                        max_box=100,  # None
                        min_top_indices=0,
                        fontsize=4,
                        linewidth=0.5):
    """
    A lightly modified version of dtc_predict.py.
    Runs prediction on each image in a directory, writes out images with
    bounding boxes drawn on them, and also builds a data frame holding the
    predicted positions and labels.

    Args:
        predict_dir : directory containing the images to predict on
        predicted_dir : output directory for the predicted images
        dict : dictionary mapping prediction class ids to class names,
               e.g. dict = {0.0: "other", 1.0: "Bicycle", 2.0: "Pedestrian",
                            3.0: "Signal", 4.0: "Signs", 5.0: "Truck",
                            6.0: "Car"}
        model_path : path of the model file to load
        conf_threshold : confidence threshold for predictions
        is_conf_threshold_down : whether to keep lowering the confidence
            threshold until something is detected
        class_model : classification model object used to re-predict the
            detected regions with a model other than SSD
        dict_class : dictionary mapping the re-prediction model's class ids
            to class names
        img_height, img_width : input image size of the re-prediction model
            (must be the model's default size)
        is_overwrite : if False, skip prediction when a file of the same
            name already exists in the output directory
        max_box : maximum number of regions detected per image; None means
            no limit, 100 means up to 100 detections
        min_top_indices : detect at least min_top_indices + 1 regions; the
            default of 0 means at least 1. Only takes effect when
            is_conf_threshold_down=True
        fontsize : font size of the predicted labels drawn on the image
        linewidth : line width of the predicted boxes drawn on the image
    Return:
        None (writes the predicted images and a data frame of the results
        (pred.csv))
    """
    num_classes = len(dict)  # 6+1

    # Paths of the images to predict on
    img_path_list = glob.glob(os.path.join(predict_dir, "*.*"))

    # Folder for the prediction results
    if not os.path.isdir(predicted_dir):
        os.mkdir(predicted_dir)

    file_names = []   # list of file names
    inputs = []       # image data resized to the network input size
    images_h = []     # heights of the original-size images
    images_w = []     # widths of the original-size images
    images = []       # original-size image data for inspecting the results
    correctpred_filecount = 0

    # Load the model
    model = create_model(num_classes)
    model.load_weights(model_path)
    print(model)

    import pandas as pd

    # Create an empty data frame
    pred_df = pd.DataFrame(index=[],
                           columns=['file_names', 'conf', 'label_name',
                                    'x', 'y', 'x+w', 'y+h'])

    # ---- for json output ----
    prediction = {}
    # -------------------------

    # Process the images one at a time
    for path in tqdm(img_path_list):
        # Skip prediction if a file of the same name exists in the
        # output directory
        if is_overwrite == False and os.path.isfile(
                os.path.join(predicted_dir, os.path.basename(path))):
            continue

        file_names = []
        file_names.append(os.path.basename(path))

        # ---- for json output ----
        img_name = os.path.basename(path)
        prediction[img_name] = {}
        # -------------------------

        img, height, width = load_img(path, target_size=input_shape)
        img = image.img_to_array(img)
        inputs = []
        inputs.append(img.copy())
        images_h = []
        images_h.append(height)
        images_w = []
        images_w.append(width)
        images = []
        temp_image = imread(path)
        images.append(temp_image.copy())

        # Preprocess the input image
        inputs = preprocess_input(np.array(inputs))

        # Run prediction
        pred_results = model.predict(inputs, batch_size=1, verbose=0)

        bbox_util = BBoxUtility(num_classes)
        bbox_results = bbox_util.detection_out(pred_results)

        for file_no in range(len(file_names)):
            # Draw the original image
            plt.imshow(images[file_no] / 255.)

            # Get the predicted boxes
            bbox_label = bbox_results[file_no][:, 0]
            bbox_conf = bbox_results[file_no][:, 1]
            bbox_xmin = bbox_results[file_no][:, 2]
            bbox_ymin = bbox_results[file_no][:, 3]
            bbox_xmax = bbox_results[file_no][:, 4]
            bbox_ymax = bbox_results[file_no][:, 5]

            # Keep only boxes whose confidence exceeds the threshold
            top_indices = [i for i, conf in enumerate(bbox_conf)
                           if conf > conf_threshold]

            # --- optionally lower conf_threshold until
            #     len(top_indices) > min_top_indices ---
            if is_conf_threshold_down == True:
                conf_threshold_change = 0.0
                if len(top_indices) == 0:
                    # Nothing was detected at the base conf_threshold, so
                    # lower the threshold until something is detected
                    for conf_threshold_i in range(int(conf_threshold // 0.01)):
                        conf_threshold_change = conf_threshold - (
                            (conf_threshold_i + 1) * 0.01)
                        top_indices = [
                            i for i, conf in enumerate(bbox_conf)
                            if conf > conf_threshold_change
                        ]
                        if len(top_indices) > min_top_indices:
                            break
            # ---------------------------------------------------------

            img_h = images_h[file_no]
            img_w = images_w[file_no]

            currentAxis = plt.gca()

            for box_no, top_index in enumerate(top_indices):
                # Skip boxes past the maximum number of detections
                # (the AI_Edge_Contest caps detections at 100 per image)
                if (max_box is not None) and (box_no >= max_box):
                    continue

                # Build the predicted box
                label = bbox_label[top_index]
                x = int(bbox_xmin[top_index] * img_w)
                y = int(bbox_ymin[top_index] * img_h)
                w = int((bbox_xmax[top_index] - bbox_xmin[top_index]) * img_w)
                h = int((bbox_ymax[top_index] - bbox_ymin[top_index]) * img_h)
                box = (x, y), w, h

                conf = float(bbox_conf[top_index])
                label_name = dict[label]

                # ------------- re-predict with the classification model -------------
                # Flag marking whether to keep this detection
                is_inclode = True
                if class_model is not None:
                    if conf < conf_threshold:
                        # Crop the detected region (from the ndarray image).
                        # ndarray slicing must be in [y:y_max, x:x_max]
                        # order, otherwise the crop is wrong
                        # https://qiita.com/tadOne/items/8967f046ca395669329d
                        tmp_img = images[file_no]
                        dst = tmp_img[y:y + h, x:x + w]

                        # Showing the image here would prevent the bbox
                        # image from being saved; for checking only
                        # plt.imshow(dst / 255.)
                        # plt.show()

                        # Predict the cropped image with the
                        # classification model
                        class_conf, class_label_id = predict_class_model(
                            dst, class_model, img_height, img_width)

                        # If the classification model scores higher,
                        # overwrite the label and the score
                        if conf <= class_conf:
                            label_name = dict_class[class_label_id]
                            conf = float(class_conf)
                        # elif top_index > 1:
                        #     # Drop low-scoring detections when at least
                        #     # one detection already exists
                        #     is_inclode = False
                # ---------------------------------------------------------

                # Drop the detection if its score is too low
                if is_inclode == True:
                    # Draw the bbox on the image
                    display_txt = '{:0.2f}, {}'.format(conf, label_name)
                    currentAxis.add_patch(
                        plt.Rectangle(*box, fill=False,
                                      edgecolor=get_class_color(label),
                                      linewidth=linewidth))
                    currentAxis.text(x, y, display_txt,
                                     bbox={'facecolor': get_class_color(label),
                                           'alpha': 0.2},
                                     fontsize=fontsize)

                    # Keep the result in the data frame
                    series = pd.Series([file_names[file_no], conf,
                                        label_name, x, y, x + w, y + h],
                                       index=pred_df.columns)
                    pred_df = pred_df.append(series, ignore_index=True)

                    # -------------------------- for json output --------------------------
                    if label_name not in prediction[img_name]:
                        prediction[img_name][label_name] = []
                    prediction[img_name][label_name].append(
                        [x, y, x + w, y + h])
                    # ----------------------------------------------------------------------

            # Save the predicted image file
            plt.savefig(os.path.join(predicted_dir, file_names[file_no]),
                        dpi=300)
            plt.clf()

    output_dir = os.path.dirname(predicted_dir)
    pred_df.to_csv(os.path.join(output_dir, 'pred.csv'),
                   sep='\t', index=False)

    # -------------------------- for json output --------------------------
    with open(os.path.join(output_dir, 'pred.json'), 'w') as f:
        json.dump(prediction, f, indent=4)  # write the json file indented

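# Usage sketch for dtc_predict_py_edit, assuming create_model() and the
# weight file exist; directory names and the weight path are placeholders.
dtc_predict_py_edit(
    predict_dir='./test_images',
    predicted_dir='./predicted_images',
    dict={0.0: "other", 1.0: "Bicycle", 2.0: "Pedestrian",
          3.0: "Signal", 4.0: "Signs", 5.0: "Truck", 6.0: "Car"},
    model_path='weight_ssd_best.hdf5',
    conf_threshold=0.6)
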
    elif result[6] is None:
        print('no image data string.')
    else:
        values = list(map(ord, result[6]))

        # The image data from Pepper is a flat array, so map it onto
        # (x, y) positions and RGB channels
        i = 0
        for y in range(0, height):
            for x in range(0, width):
                image.itemset((y, x, 0), values[i + 0])
                image.itemset((y, x, 1), values[i + 1])
                image.itemset((y, x, 2), values[i + 2])
                i += 3

        image = cv2.resize(image, (300, 300))
        # cv2.imwrite("input.jpg", frame)

        # Preprocess the image before feeding it to Keras
        input_image = [imgprocess.img_to_array(image)]
        input_image = preprocess_input(np.array(input_image))
        prediction = model.predict(input_image)  # run the network
        results = bbox_util.detection_out(prediction)  # decode output as bboxes
        result_image = draw_bbox_from_results(image, results)  # draw bboxes
        cv2.imshow("pepper-camera-ssd", image)  # show the image

        k = cv2.waitKey(5)
        if k == ord('q'):
            break

videoDevice.unsubscribe(nameID)

def run_camera(input_shape, model, root_path, action_class, frame_number):
    num_classes = 21
    conf_thresh = 0.6
    bbox_util = BBoxUtility(num_classes)

    class_colors = []
    for i in range(0, num_classes):
        hue = 255 * i / num_classes
        col = np.zeros((1, 1, 3)).astype("uint8")
        col[0][0][0] = hue
        col[0][0][1] = 128  # Saturation
        col[0][0][2] = 255  # Value
        cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
        col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2]))
        class_colors.append(col)

    vid = cv2.VideoCapture(0)
    sleep(2)

    # Compute aspect ratio of video
    vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
    vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
    # vidar = vidw / vidh

    crop_path = root_path + 'crop/' + action_class
    origin_path = root_path + 'origin/' + action_class
    mask_path = root_path + 'mask/' + action_class
    samples = os.listdir(origin_path)
    sample_count = len(samples)

    while True:
        retval, orig_image = vid.read()
        if not retval:
            print("Done!")
            return None

        im_size = (input_shape[0], input_shape[1])
        resized = cv2.resize(orig_image, im_size)
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

        inputs = [image.img_to_array(rgb)]
        tmp_inp = np.array(inputs)
        x = preprocess_input(tmp_inp)
        y = model.predict(x)
        results = bbox_util.detection_out(y)

        if len(results) > 0 and len(results[0]) > 0:
            det_label = results[0][:, 0]
            det_conf = results[0][:, 1]
            det_xmin = results[0][:, 2]
            det_ymin = results[0][:, 3]
            det_xmax = results[0][:, 4]
            det_ymax = results[0][:, 5]

            top_indices = [i for i, conf in enumerate(det_conf)
                           if conf >= conf_thresh]

            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]

            # Class 15 is "person" in the VOC ordering used here
            if 15 not in top_label_indices:
                detected = False
            else:
                detected = True

            for i in range(top_conf.shape[0]):
                # Pad the box by 10% on each side, clamping to the frame
                xmin = int(round((top_xmin[i] * vidw) * 0.9))
                ymin = int(round((top_ymin[i] * vidh) * 0.9))
                xmax = (int(round((top_xmax[i] * vidw) * 1.1))
                        if int(round((top_xmax[i] * vidw) * 1.1)) <= vidw
                        else int(round(top_xmax[i] * vidw)))
                ymax = (int(round((top_ymax[i] * vidh) * 1.1))
                        if int(round((top_ymax[i] * vidh) * 1.1)) <= vidh
                        else int(round(top_ymax[i] * vidh)))

                # save frames
                class_num = int(top_label_indices[i])
                if class_num == 15:
                    frame = copy.deepcopy(orig_image)
                    cv2.rectangle(orig_image, (xmin, ymin), (xmax, ymax),
                                  class_colors[class_num], 2)
                    curl = np.zeros_like(frame, dtype='uint8')
                    curl[ymin:ymax, xmin:xmax, :] = \
                        frame[ymin:ymax, xmin:xmax, :]
                    crop = cv2.resize(frame[ymin:ymax, xmin:xmax, :],
                                      (64, 96))
                    curl = cv2.resize(curl, (160, 120))
                    frame = cv2.resize(frame, (160, 120))
                else:
                    detected = False

        cv2.imshow("SSD result", orig_image)
        if cv2.waitKey(5) & 0xFF == ord('s') and detected:
            sample_count += 1
            cv2.imwrite(crop_path + '/' + str(sample_count + 10000)
                        + '.jpg', crop)
            print('saving ' + crop_path + '/' + str(sample_count + 10000)
                  + '.jpg')
            cv2.imwrite(origin_path + '/' + str(sample_count + 10000)
                        + '.jpg', frame)
            print('saving ' + origin_path + '/' + str(sample_count + 10000)
                  + '.jpg')
            cv2.imwrite(mask_path + '/' + str(sample_count + 10000)
                        + '.jpg', curl)
            print('saving ' + mask_path + '/' + str(sample_count + 10000)
                  + '.jpg')

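# Usage sketch for run_camera, assuming an SSD300 model with the VOC weights
# used elsewhere in this file; the root_path/action_class directory layout
# ('crop/', 'origin/', 'mask/') must already exist. Names are placeholders.
input_shape = (300, 300, 3)
model = SSD300(input_shape, num_classes=21)
model.load_weights('weights_SSD300.hdf5', by_name=True)
run_camera(input_shape, model, './data/', 'waving', 0)
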
class ssdKeras():
    def __init__(self):
        # self.node_name = "ssd_keras"
        # rospy.init_node(self.node_name)
        self.class_names = ["background", "aeroplane", "bicycle", "bird",
                            "boat", "bottle", "bus", "car", "cat", "chair",
                            "cow", "diningtable", "dog", "horse",
                            "motorbike", "person", "pottedplant", "sheep",
                            "sofa", "train", "tvmonitor"]
        self.num_classes = len(self.class_names)
        self.input_shape = (300, 300, 3)
        self.model = SSD(self.input_shape, num_classes=self.num_classes)
        self.model.load_weights(
            '/home/abdulrahman/catkin_ws/src/victim_localization/resources/ssd_keras/weights_SSD300.hdf5'
        )
        self.bbox_util = BBoxUtility(self.num_classes)
        self.conf_thresh = 0.4

        self.model._make_predict_function()
        self.graph = tf.get_default_graph()

        self.detection_index = DL_msgs_boxes()

        # Create unique and somewhat visually distinguishable bright
        # colors for the different classes.
        self.class_colors = []
        for i in range(0, self.num_classes):
            # This can probably be written in a more elegant manner
            hue = 255 * i / self.num_classes
            col = np.zeros((1, 1, 3)).astype("uint8")
            col[0][0][0] = hue
            col[0][0][1] = 128  # Saturation
            col[0][0][2] = 255  # Value
            cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
            col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]),
                   int(cvcol[0][0][2]))
            self.class_colors.append(col)

        self.bridge = CvBridge()  # create the cv_bridge object
        self.image_sub = rospy.Subscriber(
            "front_cam/rgb/image_raw", Image, self.detect_image,
            queue_size=1)  # image callback
        self.box_coordinate_pub = rospy.Publisher(
            "/ssd_detction/box", DL_msgs_boxes,
            queue_size=5)  # detection box publisher

    def detect_image(self, ros_image):
        """ Runs detection on a single ROS image message.

        # Arguments
        ros_image: The image message to run the network on. Boxes with
                   confidence below self.conf_thresh are not published.
        """
        # Use cv_bridge() to convert the ROS image to OpenCV format
        try:
            image_orig = self.bridge.imgmsg_to_cv2(ros_image, "bgr8")
        except CvBridgeError as e:
            print(e)

        vidw = 640.0  # instead of cv2.cv.CV_CAP_PROP_FRAME_WIDTH
        vidh = 480.0  # instead of cv2.cv.CV_CAP_PROP_FRAME_HEIGHT
        vidar = vidw / vidh

        im_size = (self.input_shape[0], self.input_shape[1])
        resized = cv2.resize(image_orig, im_size)
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

        # Reshape to original aspect ratio for later visualization
        # The resized version is used, to visualize what kind of resolution
        # the network has to work with.
        to_draw = cv2.resize(resized, (640, 480))

        # Use model to predict
        inputs = [image.img_to_array(rgb)]
        tmp_inp = np.array(inputs)
        x = preprocess_input(tmp_inp)

        start_time = time.time()  # debugging
        with self.graph.as_default():
            y = self.model.predict(x)
        # print("--- %s seconds_for_one_image ---" % (time.time() - start_time))

        # This line creates a new TensorFlow device every time. Is there a
        # way to avoid that?
        results = self.bbox_util.detection_out(y)

        if len(results) > 0 and len(results[0]) > 0:
            # Interpret output, only one frame is used
            det_label = results[0][:, 0]
            det_conf = results[0][:, 1]
            det_xmin = results[0][:, 2]
            det_ymin = results[0][:, 3]
            det_xmax = results[0][:, 4]
            det_ymax = results[0][:, 5]

            top_indices = [i for i, conf in enumerate(det_conf)
                           if conf >= self.conf_thresh]

            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]

            # Initialize the detection message with a placeholder box
            box_msg = DL_msgs_box()
            box_msg.xmin = 0
            box_msg.ymin = 0
            box_msg.xmax = 0
            box_msg.ymax = 0
            box_msg.Class = "Non"  # placeholder "no class" value
            self.detection_index.boxes.append(box_msg)
            print(top_xmin)

            for i in range(top_conf.shape[0]):
                self.detection_index.boxes[:] = []
                xmin = int(round(top_xmin[i] * to_draw.shape[1]))
                ymin = int(round(top_ymin[i] * to_draw.shape[0]))
                xmax = int(round(top_xmax[i] * to_draw.shape[1]))
                ymax = int(round(top_ymax[i] * to_draw.shape[0]))

                # Include the corners to be published
                box_msg = DL_msgs_box()
                box_msg.xmin = xmin
                box_msg.ymin = ymin
                box_msg.xmax = xmax
                box_msg.ymax = ymax
                box_msg.Class = self.class_names[int(top_label_indices[i])]
                self.detection_index.boxes.append(box_msg)

                # Draw the box on top of the to_draw image
                class_num = int(top_label_indices[i])
                if (self.class_names[class_num] == "person"):
                    cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax),
                                  self.class_colors[class_num], 2)
                    text = (self.class_names[class_num] + " "
                            + ('%.2f' % top_conf[i]))

                    text_top = (xmin, ymin - 10)
                    text_bot = (xmin + 80, ymin + 5)
                    text_pos = (xmin + 5, ymin)
                    cv2.rectangle(to_draw, text_top, text_bot,
                                  self.class_colors[class_num], -1)
                    cv2.putText(to_draw, text, text_pos,
                                cv2.FONT_HERSHEY_SIMPLEX, 0.35,
                                (0, 0, 0), 1)
                    # cv2.circle(to_draw, (xmax, ymax), 1,
                    #            self.class_colors[class_num], 30)

            self.detection_index.header = std_msgs.msg.Header()
            self.detection_index.header.stamp = rospy.Time.now()
            print(self.detection_index)
            self.box_coordinate_pub.publish(self.detection_index)
            self.detection_index.boxes[:] = []
            # self.detection_index.boxes.clear()

        cv2.imshow("SSD result", to_draw)
        cv2.waitKey(1)

    def main(self):
        rospy.spin()

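# Usage sketch for ssdKeras, assuming a ROS environment with the
# front_cam/rgb/image_raw topic and the DL_msgs_box(es) message types
# available; the node name is a placeholder.
if __name__ == '__main__':
    rospy.init_node('ssd_keras')
    detector = ssdKeras()
    detector.main()  # spins until shutdown
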
class SSD:
    def __init__(self, input_shape=(300, 300, 3)):
        self.num_class = config.NUM_CLASSES
        self.input_tensor = tf.placeholder(
            tf.float32,
            [None, input_shape[0], input_shape[1], input_shape[2]])
        self.label_tensor = tf.placeholder(
            tf.float32, [None, 7308, 4 + config.NUM_CLASSES + 8])
        self.predicts = self.build(input_shape, config.NUM_CLASSES)
        self.input_shape = input_shape
        self.global_step = tf.train.create_global_step()
        var_list = tf.global_variables()
        var_list = [var for var in var_list if "Adam" not in var.name]
        self.saver = tf.train.Saver(var_list, max_to_keep=1)
        self.bbox_util = BBoxUtility(self.num_class)

    def build(self, input_shape, num_classes):
        img_size = (input_shape[1], input_shape[0])

        # 300
        conv1_1 = tf.layers.conv2d(self.input_tensor, 64, 3, name="conv1_1",
                                   padding="same", activation=activation)
        self.conv1_1 = conv1_1
        conv1_2 = tf.layers.conv2d(conv1_1, 64, 3, name="conv1_2",
                                   padding="same", activation=activation)
        pool1 = tf.layers.max_pooling2d(conv1_2, pool_size=2, strides=2,
                                        padding="same")

        # 150
        conv2_1 = tf.layers.conv2d(pool1, 128, 3, name="conv2_1",
                                   padding="same", activation=activation)
        conv2_2 = tf.layers.conv2d(conv2_1, 128, 3, name="conv2_2",
                                   padding="same", activation=activation)
        pool2 = tf.layers.max_pooling2d(conv2_2, pool_size=2, strides=2,
                                        padding="same")

        # 75
        conv3_1 = tf.layers.conv2d(pool2, 256, 3, name="conv3_1",
                                   padding="same", activation=activation)
        conv3_2 = tf.layers.conv2d(conv3_1, 256, 3, name="conv3_2",
                                   padding="same", activation=activation)
        conv3_3 = tf.layers.conv2d(conv3_2, 256, 3, name="conv3_3",
                                   padding="same", activation=activation)
        pool3 = tf.layers.max_pooling2d(conv3_3, pool_size=2, strides=2,
                                        padding="same")

        # 38
        conv4_1 = tf.layers.conv2d(pool3, 512, 3, name="conv4_1",
                                   padding="same", activation=activation)
        conv4_2 = tf.layers.conv2d(conv4_1, 512, 3, name="conv4_2",
                                   padding="same", activation=activation)
        conv4_3 = tf.layers.conv2d(conv4_2, 512, 3, name="conv4_3",
                                   padding="same", activation=activation)
        pool4 = tf.layers.max_pooling2d(conv4_3, pool_size=2, strides=2,
                                        padding="same")

        # 19
        conv5_1 = tf.layers.conv2d(pool4, 512, 3, name="conv5_1",
                                   padding="same", activation=activation)
        conv5_2 = tf.layers.conv2d(conv5_1, 512, 3, name="conv5_2",
                                   padding="same", activation=activation)
        conv5_3 = tf.layers.conv2d(conv5_2, 512, 3, name="conv5_3",
                                   padding="same", activation=activation)
        pool5 = tf.layers.max_pooling2d(conv5_3, pool_size=3, strides=1,
                                        padding="same")

        # 19: fc6 as an atrous (dilated) convolution
        fc6_kernel = tf.get_variable(
            name="fc6/kernel", shape=(3, 3, 512, 1024),
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc6_bias = tf.get_variable(
            name="fc6/bias", shape=[1024],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc6 = tf.nn.atrous_conv2d(pool5, fc6_kernel, rate=6,
                                  padding="SAME", name="fc6")
        fc6 = tf.nn.bias_add(fc6, fc6_bias)
        fc6 = activation(fc6)

        fc7 = tf.layers.conv2d(fc6, 1024, 1, name="fc7", padding="same",
                               activation=activation)

        conv6_1 = tf.layers.conv2d(fc7, 256, 1, name="conv6_1",
                                   padding="same", activation=activation)
        conv6_2 = tf.layers.conv2d(conv6_1, 512, 3, name="conv6_2",
                                   strides=(2, 2), padding="same",
                                   activation=activation)

        # 10
        conv7_1 = tf.layers.conv2d(conv6_2, 128, 1, name="conv7_1",
                                   padding="same", activation=activation)
        conv7_2 = tf.keras.layers.ZeroPadding2D()(conv7_1)
        conv7_2 = tf.layers.conv2d(conv7_2, 256, 3, name="conv7_2",
                                   padding="valid", strides=(2, 2),
                                   activation=activation)

        # 5
        conv8_1 = tf.layers.conv2d(conv7_2, 128, 1, name="conv8_1",
                                   padding="same", activation=activation)
        conv8_2 = tf.layers.conv2d(conv8_1, 256, 3, name="conv8_2",
                                   padding="same", strides=(2, 2),
                                   activation=activation)

        # 3
        pool6 = tf.keras.layers.GlobalAveragePooling2D(name='pool6')(conv8_2)

        # 1
        num_priors = 3
        conv4_3_norm = self.normalize_layer(conv4_3, 20, 512,
                                            "conv4_3_norm")
        conv4_3_norm_mbox_loc = tf.layers.conv2d(
            conv4_3_norm, num_priors * 4, 3,
            name="conv4_3_norm_mbox_loc", padding="same")
        conv4_3_norm_mbox_loc_flat = tf.layers.flatten(
            conv4_3_norm_mbox_loc)
        name = "conv4_3_norm_mbox_conf"
        if num_classes != 21:
            name += "_" + str(num_classes)
        conv4_3_norm_mbox_conf = tf.layers.conv2d(
            conv4_3_norm, num_priors * num_classes, 3,
            name=name, padding="same")
        conv4_3_norm_mbox_conf_flat = tf.layers.flatten(
            conv4_3_norm_mbox_conf)
        shape = [0, 38, 38, 512]
        conv4_3_norm_mbox_priorbox = self.priorBox_layer(
            conv4_3_norm, shape, img_size, 30.0, aspect_ratios=[2],
            variances=[0.1, 0.1, 0.2, 0.2],
            name='conv4_3_norm_mbox_priorbox')

        num_priors = 6
        fc7_mbox_loc = tf.layers.conv2d(fc7, num_priors * 4, 3,
                                        name="fc7_mbox_loc",
                                        padding="same")
        fc7_mbox_loc_flat = tf.layers.flatten(fc7_mbox_loc)
        name = "fc7_mbox_conf"
        if num_classes != 21:
            name += "_" + str(num_classes)
        fc7_mbox_conf = tf.layers.conv2d(fc7, num_priors * num_classes, 3,
                                         name=name, padding="same")
        fc7_mbox_conf_flat = tf.layers.flatten(fc7_mbox_conf)
        shape = [0, 19, 19, 1024]
        fc7_mbox_priorbox = self.priorBox_layer(
            fc7, shape, img_size, 60.0, max_size=114.0,
            aspect_ratios=[2, 3], variances=[0.1, 0.1, 0.2, 0.2],
            name='fc7_mbox_priorbox')

        num_priors = 6
        conv6_2_mbox_loc = tf.layers.conv2d(conv6_2, num_priors * 4, 3,
                                            name="conv6_2_mbox_loc",
                                            padding="same")
        conv6_2_mbox_loc_flat = tf.layers.flatten(conv6_2_mbox_loc)
        name = "conv6_2_mbox_conf"
        if num_classes != 21:
            name += "_" + str(num_classes)
        conv6_2_mbox_conf = tf.layers.conv2d(
            conv6_2, num_priors * num_classes, 3,
            name=name, padding="same")
        conv6_2_mbox_conf_flat = tf.layers.flatten(conv6_2_mbox_conf)
        shape = [0, 10, 10, 256]
        conv6_2_mbox_priorbox = self.priorBox_layer(
            conv6_2, shape, img_size, 114.0, max_size=168.0,
            aspect_ratios=[2, 3], variances=[0.1, 0.1, 0.2, 0.2],
            name='conv6_2_mbox_priorbox')

        num_priors = 6
        conv7_2_mbox_loc = tf.layers.conv2d(conv7_2, num_priors * 4, 3,
                                            name="conv7_2_mbox_loc",
                                            padding="same")
        conv7_2_mbox_loc_flat = tf.layers.flatten(conv7_2_mbox_loc)
        name = "conv7_2_mbox_conf"
        if num_classes != 21:
            name += "_" + str(num_classes)
        conv7_2_mbox_conf = tf.layers.conv2d(
            conv7_2, num_priors * num_classes, 3,
            name=name, padding="same")
        conv7_2_mbox_conf_flat = tf.layers.flatten(conv7_2_mbox_conf)
        shape = [0, 5, 5, 256]
        conv7_2_mbox_priorbox = self.priorBox_layer(
            conv7_2, shape, img_size, 168.0, max_size=222.0,
            aspect_ratios=[2, 3], variances=[0.1, 0.1, 0.2, 0.2],
            name='conv7_2_mbox_priorbox')

        num_priors = 6
        conv8_2_mbox_loc = tf.layers.conv2d(conv8_2, num_priors * 4, 3,
                                            name="conv8_2_mbox_loc",
                                            padding="same")
        conv8_2_mbox_loc_flat = tf.layers.flatten(conv8_2_mbox_loc)
        name = "conv8_2_mbox_conf"
        if num_classes != 21:
            name += "_" + str(num_classes)
        conv8_2_mbox_conf = tf.layers.conv2d(
            conv8_2, num_priors * num_classes, 3,
            name=name, padding="same")
        conv8_2_mbox_conf_flat = tf.layers.flatten(conv8_2_mbox_conf)
        shape = [0, 3, 3, 256]
        conv8_2_mbox_priorbox = self.priorBox_layer(
            conv8_2, shape, img_size, 222.0, max_size=276.0,
            aspect_ratios=[2, 3], variances=[0.1, 0.1, 0.2, 0.2],
            name='conv8_2_mbox_priorbox')

        num_priors = 6
        pool6_mbox_loc_flat = tf.layers.dense(pool6, units=num_priors * 4,
                                              name='pool6_mbox_loc_flat')
        name = "pool6_mbox_conf_flat"
        if num_classes != 21:
            name += "_" + str(num_classes)
        pool6_mbox_conf_flat = tf.layers.dense(
            pool6, units=num_priors * num_classes, name=name)
        shape = [0, 1, 1, 256]
        pool6_mbox_priorbox = self.priorBox_layer(
            tf.reshape(pool6, (-1, 1, 1, 256)), shape, img_size, 276.0,
            max_size=330.0, aspect_ratios=[2, 3],
            variances=[0.1, 0.1, 0.2, 0.2], name='pool6_mbox_priorbox')

        mbox_loc = tf.concat([conv4_3_norm_mbox_loc_flat,
                              fc7_mbox_loc_flat, conv6_2_mbox_loc_flat,
                              conv7_2_mbox_loc_flat, conv8_2_mbox_loc_flat,
                              pool6_mbox_loc_flat], axis=1)
        mbox_conf = tf.concat([conv4_3_norm_mbox_conf_flat,
                               fc7_mbox_conf_flat, conv6_2_mbox_conf_flat,
                               conv7_2_mbox_conf_flat,
                               conv8_2_mbox_conf_flat,
                               pool6_mbox_conf_flat], axis=1)
        mbox_priorbox = tf.concat([conv4_3_norm_mbox_priorbox,
                                   fc7_mbox_priorbox,
                                   conv6_2_mbox_priorbox,
                                   conv7_2_mbox_priorbox,
                                   conv8_2_mbox_priorbox,
                                   pool6_mbox_priorbox], axis=1)
        mbox_priorbox = tf.cast(mbox_priorbox, tf.float32)

        num_boxes = tf.shape(mbox_loc)[-1] // 4
        mbox_loc = tf.reshape(mbox_loc, (-1, num_boxes, 4))
        mbox_conf = tf.reshape(mbox_conf, (-1, num_boxes, num_classes))
        mbox_conf = tf.nn.softmax(mbox_conf)
        predictions = tf.concat([mbox_loc, mbox_conf, mbox_priorbox],
                                axis=2)
        return predictions

    def normalize_layer(self, net, init_scale, shape=512, name=None):
        init_scale = init_scale * np.ones(shape)
        scale = tf.Variable(init_scale, name=name, dtype=tf.float32)
        return scale * tf.nn.l2_normalize(net, 3)

    def priorBox_layer(self, net, input_shape, img_size, min_size,
                       max_size=None, aspect_ratios=None, flip=True,
                       variances=[0.1], clip=True, name=None):
        aspect_ratios_ = [1.0]
        if max_size:
            if max_size < min_size:
                raise Exception('max_size must be greater than min_size.')
            aspect_ratios_.append(1.0)
        if aspect_ratios:
            for ar in aspect_ratios:
                if ar in aspect_ratios_:
                    continue
                aspect_ratios_.append(ar)
                if flip:
                    aspect_ratios_.append(1.0 / ar)
        variances = np.array(variances)

        layer_width = input_shape[2]
        layer_height = input_shape[1]
        img_width = img_size[0]
        img_height = img_size[1]

        box_widths = []
        box_heights = []
        for ar in aspect_ratios_:
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(min_size)
                box_heights.append(min_size)
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(min_size * max_size))
                box_heights.append(np.sqrt(min_size * max_size))
            elif ar != 1:
                box_widths.append(min_size * np.sqrt(ar))
                box_heights.append(min_size / np.sqrt(ar))
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)

        step_x = img_width / layer_width
        step_y = img_height / layer_height
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                           layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                           layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        num_priors_ = len(aspect_ratios_)
        prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
        prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
        prior_boxes[:, ::4] -= box_widths
        prior_boxes[:, 1::4] -= box_heights
        prior_boxes[:, 2::4] += box_widths
        prior_boxes[:, 3::4] += box_heights
        prior_boxes[:, ::2] /= img_width
        prior_boxes[:, 1::2] /= img_height
        prior_boxes = prior_boxes.reshape(-1, 4)
        if clip:
            prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)

        num_boxes = len(prior_boxes)
        if len(variances) == 1:
            variances = np.ones((num_boxes, 4)) * variances[0]
        elif len(variances) == 4:
            variances = np.tile(variances, (num_boxes, 1))
        else:
            raise Exception('Must provide one or four variances.')

        prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
        prior_boxes_tensor = tf.expand_dims(
            tf.Variable(prior_boxes, name=name), 0)
        pattern = [tf.shape(net)[0], 1, 1]
        prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)
        return prior_boxes_tensor

    def restore(self, sess):
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint)
        if checkpoint:
            print("restore from: " + checkpoint)
            self.saver.restore(sess, checkpoint)
        # If you don't want to use these pretrained weights and don't want
        # to install h5py, you can comment out this block.
        elif os.path.exists('weights_SSD300.hdf5'):
            print("restore from pretrained weights")
            tf_variables = {}
            ops = []
            for variables in tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES):
                if "Adam" in variables.name or "RMS" in variables.name:
                    continue
                key = variables.name.split("/")[0].split(":")[0]
                if key not in tf_variables:
                    tf_variables[key] = [variables]
                else:
                    tf_variables[key].append(variables)
            with h5py.File('weights_SSD300.hdf5', 'r') as f:
                for k in f.keys():
                    if k in tf_variables:
                        nn = 0
                        for kk in f[k].keys():
                            a = np.array(f[k][kk])
                            ops.append(tf_variables[k][nn].assign(a))
                            nn += 1
            sess.run(ops)
        elif os.path.exists("vgg16.npy"):
            print("restore from vgg weights.")
            vgg = np.load("vgg16.npy", encoding='latin1').item()
            ops = []
            vgg_dict = ["conv1_1", "conv1_2", "conv2_1", "conv2_2",
                        "conv3_1", "conv3_2", "conv3_3", "conv4_1",
                        "conv4_2", "conv4_3", "conv5_1", "conv5_2",
                        "conv5_3"]
            tf_variables = {}
            for variables in tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES):
                if "Adam" in variables.name or "RMS" in variables.name:
                    continue
                key = variables.name.split("/")[0].split(":")[0]
                if key not in vgg_dict:
                    continue
                if key not in tf_variables:
                    tf_variables[key] = [variables]
                    ops.append(variables.assign(vgg[key][0]))
                else:
                    tf_variables[key].append(variables)
                    ops.append(variables.assign(vgg[key][1]))
            sess.run(ops)
        else:
            print("train from scratch.")

    def train(self):
        self.loss = MultiboxLoss(
            self.num_class,
            neg_pos_ratio=2.0).compute_loss(self.label_tensor,
                                            self.predicts)
        self.loss_avg = tf.reduce_mean(self.loss)
        learning_rate = tf.train.exponential_decay(
            config.lr, self.global_step, 10000, 0.9, True,
            name='learning_rate')
        self.train_op = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(
                self.loss, global_step=self.global_step)
        self.train_loss_summary = tf.summary.scalar("loss_train",
                                                    self.loss_avg)
        self.val_loss_summary = tf.summary.scalar("loss_val",
                                                  self.loss_avg)
        self.writer = tf.summary.FileWriter(FLAGS.checkpoint)

        priors = pickle.load(open('prior_boxes_ssd300.pkl', 'rb'))
        self.bbox_util = BBoxUtility(self.num_class, priors)
        gt = pickle.load(open(FLAGS.label_file, 'rb'))
        keys = sorted(gt.keys())
        num_train = int(round(0.8 * len(keys)))
        train_keys = keys[:num_train]
        val_keys = keys[num_train:]
        gen = Generator(gt, self.bbox_util, config.BATCH_SIZE,
                        FLAGS.images_dir, train_keys, val_keys,
                        (self.input_shape[0], self.input_shape[1]))
        # do_crop=False, saturation_var=0, brightness_var=0,
        # contrast_var=0, lighting_std=0, hflip_prob=0, vflip_prob=0

        c = tf.ConfigProto()
        c.gpu_options.allow_growth = True
        with tf.Session(config=c) as sess:
            sess.run(tf.global_variables_initializer())
            self.writer.add_graph(sess.graph)
            self.restore(sess)
            for inputs, labels in gen.generate(True):
                _, lo, step, summary = sess.run(
                    [self.train_op, self.loss_avg, self.global_step,
                     self.train_loss_summary],
                    feed_dict={self.input_tensor: inputs,
                               self.label_tensor: labels})
                sys.stdout.write("train loss: %d %.3f \r" % (step, lo))
                sys.stdout.flush()
                self.writer.add_summary(summary, step)
                if step % config.save_step == config.save_step - 1:
                    self.saver.save(sess,
                                    os.path.join(FLAGS.checkpoint, "ckpt"),
                                    global_step=self.global_step)
                    print("saved")
                if step % config.snapshot_step == 0:
                    val_in, val_la = next(gen.generate(False))
                    lo, s, preds = sess.run(
                        [self.loss_avg, self.train_loss_summary,
                         self.predicts],
                        feed_dict={self.input_tensor: val_in,
                                   self.label_tensor: val_la})
                    self.writer.add_summary(s, step)
                    print("val loss:", step, lo)
                    images = [np.array(val_in[v])
                              for v in range(val_in.shape[0])]
                    self.paint_imgs(preds, images)
        print("Train finished. Checkpoint saved in", FLAGS.checkpoint)

    def predict(self):
        inputs = []
        images = []
        file_name = []
        file_list = os.listdir(FLAGS.images_dir)
        for file in file_list:
            img_path = os.path.join(FLAGS.images_dir, file)
            img = cv2.imread(img_path)
            images.append(img.copy())
            img = cv2.resize(img, (300, 300)).astype(np.float32)
            inputs.append(img)
            file_name.append(file)
        inputs = np.array(inputs)
        inputs = preprocess_input(np.array(inputs))
        c = tf.ConfigProto()
        c.gpu_options.allow_growth = True
        with tf.Session(config=c) as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            self.restore(sess)
            # TODO: batch
            preds = sess.run(self.predicts,
                             feed_dict={self.input_tensor: inputs})
            self.paint_imgs(preds, images, file_name)
        print("Finished. Images saved in " + FLAGS.eval_output_dir)

    def paint_imgs(self, preds, images, file_name=None):
        results = self.bbox_util.detection_out(preds)
        for j, img in enumerate(images):
            # Parse the outputs.
            det_label = results[j][:, 0]
            det_conf = results[j][:, 1]
            det_xmin = results[j][:, 2]
            det_ymin = results[j][:, 3]
            det_xmax = results[j][:, 4]
            det_ymax = results[j][:, 5]

            # Get detections with confidence higher than
            # config.visual_threshold.
            top_indices = [i for i, conf in enumerate(det_conf)
                           if conf >= config.visual_threshold]

            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]

            for i in range(top_conf.shape[0]):
                xmin = int(round(top_xmin[i] * img.shape[1]))
                ymin = int(round(top_ymin[i] * img.shape[0]))
                xmax = int(round(top_xmax[i] * img.shape[1]))
                ymax = int(round(top_ymax[i] * img.shape[0]))
                score = top_conf[i]
                label = int(top_label_indices[i])
                label_name = config.CLASS_NAMES[label - 1]
                display_txt = '{:0.2f}, {}'.format(score, label_name)
                coords = (xmin, ymin), xmax - xmin + 1, ymax - ymin + 1
                cv2.rectangle(img, (xmin, ymin), (xmax, ymax),
                              (255, 0, 0), 2)
                cv2.putText(img, display_txt, (xmin, ymin),
                            cv2.FONT_HERSHEY_COMPLEX, 0.5,
                            (255, 255, 64), 1)
            if not file_name:
                name = str(j) + ".jpg"
            else:
                name = file_name[j]
            cv2.imwrite(os.path.join(FLAGS.eval_output_dir, name), img)

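# Usage sketch for the TensorFlow SSD class above, assuming FLAGS and config
# are set up as referenced in the class (checkpoint dir, label_file,
# images_dir, eval_output_dir).
net = SSD(input_shape=(300, 300, 3))
net.train()      # trains, restoring from a checkpoint or pretrained weights
# net.predict()  # or: run inference over the images in FLAGS.images_dir
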
def classify():
    class_colors = makeClassColors()

    voc_classes = ['Prescription', 'Bicycle', 'Bird', 'Boat', 'Bottle',
                   'Bus', 'Car', 'Cat', 'Chair', 'Cow', 'Diningtable',
                   'Dog', 'Horse', 'Motorbike', 'Person', 'Pottedplant',
                   'Sheep', 'Sofa', 'Train', 'Tvmonitor']
    NUM_CLASSES = len(voc_classes) + 1
    input_shape = (300, 300, 3)
    model = SSD300(input_shape, num_classes=NUM_CLASSES)
    # model.load_weights('weights_SSD300.hdf5', by_name=True)
    weights_file = "./checkpoints/weights.00-1.25.hdf5"
    model.load_weights(weights_file, by_name=True)
    bbox_util = BBoxUtility(NUM_CLASSES)

    target_dir = "/Users/donchan/Documents/myData/miyuki/camera/None"
    # target_dir = "/Volumes/m1124/FTP/073010"
    # target_dir = "./pics"

    # Load the original images
    # files = glob.glob("/Volumes/m1124/FTP/073010/*.jpg")
    files = os.listdir(target_dir)
    np.random.shuffle(files)
    files = [os.path.join(target_dir, f) for f in files if ".jpg" in f]
    files = files[:10]

    logging.info("- " * 40)
    logging.info(files)
    logging.info("- " * 40)

    # Build pipeline images for classification (original image size)
    pipeline_images = [mpimg.imread(file) for file in files]

    # Load the images for prediction (shrunk to 300 x 300)
    inputs = []
    for x in files:
        img = image.load_img(x, target_size=(300, 300))
        img = image.img_to_array(img)
        inputs.append(img.copy())

    # Run the Keras model to classify the image contents
    logging.info(" keras model starting..... ")
    inputs = preprocess_input(np.array(inputs))
    preds = model.predict(inputs, batch_size=1, verbose=1)
    results = bbox_util.detection_out(preds)

    logging.info("")
    logging.info("Now classification for every image.")

    for i, img in enumerate(pipeline_images):
        # Parse the outputs.
        to_draw = img.copy()
        det_label = results[i][:, 0]
        det_conf = results[i][:, 1]
        det_xmin = results[i][:, 2]
        det_ymin = results[i][:, 3]
        det_xmax = results[i][:, 4]
        det_ymax = results[i][:, 5]

        # Get detections with confidence higher than 0.5.
        top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.5]

        top_conf = det_conf[top_indices]
        top_label_indices = det_label[top_indices].tolist()
        top_xmin = det_xmin[top_indices]
        top_ymin = det_ymin[top_indices]
        top_xmax = det_xmax[top_indices]
        top_ymax = det_ymax[top_indices]

        colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()

        plt.imshow(img / 255.)
        currentAxis = plt.gca()

        prescription_label_name = 0
        for j in range(top_conf.shape[0]):
            xmin = int(round(top_xmin[j] * img.shape[1]))
            ymin = int(round(top_ymin[j] * img.shape[0]))
            xmax = int(round(top_xmax[j] * img.shape[1]))
            ymax = int(round(top_ymax[j] * img.shape[0]))
            score = top_conf[j]
            label = int(top_label_indices[j])
            label_name = voc_classes[label - 1]
            if label_name == "Prescription":
                prescription_label_name = 1

            display_txt = '{:0.2f}, {}'.format(score, label_name)
            coords = (xmin, ymin), xmax - xmin + 1, ymax - ymin + 1
            color = colors[label]

            logging.info("object NO: %d %s" % ((j + 1), label_name))
            logging.info("rectangle info: %s" % (coords, ))

            currentAxis.add_patch(
                plt.Rectangle(*coords, fill=False, edgecolor=color,
                              linewidth=2))
            currentAxis.text(xmin, ymin, display_txt,
                             bbox={'facecolor': color, 'alpha': 0.5})
            cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax),
                          class_colors[label], 2)

        if prescription_label_name == 1:
            cv2.imwrite(os.path.join("./results", str(i) + '.jpg'),
                        to_draw)

    plt.show()

    # Capture frame-by-frame
    ret, img = cap.read()
    st = time.time()
    resized = cv2.resize(img, im_size)
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    to_draw = cv2.resize(resized, (int(input_shape[0] * imgar) * 3,
                                   input_shape[1] * 3))

    # Use model to predict
    inputs = [image.img_to_array(rgb)]
    tmp_inp = np.array(inputs)
    x = preprocess_input(tmp_inp)
    y = model.predict(x)

    results = bbox_util.detection_out(y)

    if len(results) > 0 and len(results[0]) > 0:
        # Interpret output, only one frame is used
        det_label = results[0][:, 0]
        det_conf = results[0][:, 1]
        det_xmin = results[0][:, 2]
        det_ymin = results[0][:, 3]
        det_xmax = results[0][:, 4]
        det_ymax = results[0][:, 5]

        top_indices = [i for i, conf in enumerate(det_conf)
                       if conf >= conf_thresh]

        top_conf = det_conf[top_indices]

class TLClassifier(object):
    def __init__(self):
        # TODO load classifier
        NUM_CLASSES = 3 + 1
        input_shape = (300, 300, 3)

        # "prior boxes" in the paper
        priors = pickle.load(open('prior_boxes_ssd300.pkl', 'rb'))
        self.bbox_util = BBoxUtility(NUM_CLASSES, priors)

        self.model = SSD300(input_shape, num_classes=NUM_CLASSES)
        self.model.load_weights('weights.180314.hdf5', by_name=True)

    def get_classification(self, img):
        """Determines the color of the traffic light in the image

        Args:
            img (cv::Mat): image containing the traffic light, assumed to
                be a 3D numpy.array (800, 600, 3) with bgr8: CV_8UC3, a
                color image with blue-green-red color order

        Returns:
            int: ID of traffic light color (specified in
                styx_msgs/TrafficLight)
        """
        img = imresize(img, (300, 300))
        # convert color-order from cv2 to Pillow
        # B, G, R = img.T
        # img = np.array((R, G, B)).T
        img = image.img_to_array(img)
        inputs = np.reshape(img, (1, 300, 300, 3))  # 'inputs' expects this size
        inputs = preprocess_input(np.array(inputs))
        preds = self.model.predict(inputs, batch_size=1, verbose=0)
        results = self.bbox_util.detection_out(preds)

        det_label = results[0][:, 0]
        det_conf = results[0][:, 1]
        det_xmin = results[0][:, 2]
        det_ymin = results[0][:, 3]
        det_xmax = results[0][:, 4]
        det_ymax = results[0][:, 5]

        # Get detections with confidence >= 0.8
        top_indices = [j for j, conf in enumerate(det_conf) if conf >= 0.8]
        top_conf = det_conf[top_indices]
        top_label_indices = det_label[top_indices].tolist()
        if top_label_indices == []:
            return TrafficLight.UNKNOWN, 0, 0, 0, 0, 0
        top_xmin = det_xmin[top_indices][0]
        top_ymin = det_ymin[top_indices][0]
        top_xmax = det_xmax[top_indices][0]
        top_ymax = det_ymax[top_indices][0]

        score = top_conf[0]  # assume only one signal detected
        label = int(top_label_indices[0])
        if label == 0:
            return TrafficLight.UNKNOWN, 0, 0, 0, 0, 0
        elif label == 1:
            return (TrafficLight.RED, score,
                    top_xmin, top_ymin, top_xmax, top_ymax)
        elif label == 2:
            return (TrafficLight.YELLOW, score,
                    top_xmin, top_ymin, top_xmax, top_ymax)
        elif label == 3:
            return (TrafficLight.GREEN, score,
                    top_xmin, top_ymin, top_xmax, top_ymax)
        else:
            return (TrafficLight.UNKNOWN, score,
                    top_xmin, top_ymin, top_xmax, top_ymax)

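# Usage sketch for TLClassifier, assuming styx_msgs' TrafficLight constants
# and the weight/prior files above are available; the image path is a
# placeholder.
clf = TLClassifier()
frame = cv2.imread('sample_light.jpg')
state, score, xmin, ymin, xmax, ymax = clf.get_classification(frame)
print(state, score)
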
class UseSSD:
    def __init__(self):
        self.image_width = 300
        self.image_height = 300
        self.voc_classes = ['Aeroplane', 'Bicycle', 'Bird', 'Boat',
                            'Bottle', 'Bus', 'Car', 'Cat', 'Chair', 'Cow',
                            'Diningtable', 'Dog', 'Horse', 'Motorbike',
                            'Person', 'Pottedplant', 'Sheep', 'Sofa',
                            'Train', 'Tvmonitor']
        self.NUM_CLASSES = len(self.voc_classes) + 1
        self.model = SSD300((self.image_height, self.image_width, 3),
                            num_classes=self.NUM_CLASSES)
        self.model.load_weights('weights_SSD300.hdf5', by_name=True)
        self.bbox_util = BBoxUtility(self.NUM_CLASSES)

    def normalize(self, img_array):
        return (img_array - np.mean(img_array)) / np.std(img_array) * 16 + 64

    def process_img(self, img_filepath, confidence, save_dirpath):
        # Original image
        with load_img(img_filepath) as img_orig:
            img_orig_array = img_to_array(img_orig)

        # Original image, normalized the same way as the analysis input
        img_orig_array_normalized = self.normalize(img_orig_array)

        # Analysis input
        with load_img(img_filepath,
                      target_size=(self.image_height,
                                   self.image_width)) as img:
            img_array = img_to_array(img)
        img_array = self.normalize(img_array)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)

        preds = self.model.predict(img_array, batch_size=1, verbose=1)
        results = self.bbox_util.detection_out(preds)
        if len(results) <= 0:
            return

        det_label = results[0][:, 0]
        det_conf = results[0][:, 1]
        det_xmin = results[0][:, 2]
        det_ymin = results[0][:, 3]
        det_xmax = results[0][:, 4]
        det_ymax = results[0][:, 5]

        top_indices = [i for i, conf in enumerate(det_conf)
                       if conf >= confidence]

        top_conf = det_conf[top_indices]
        top_label_indices = det_label[top_indices].tolist()
        top_xmin = det_xmin[top_indices]
        top_ymin = det_ymin[top_indices]
        top_xmax = det_xmax[top_indices]
        top_ymax = det_ymax[top_indices]

        filename = os.path.basename(img_filepath)
        fname, ext = os.path.splitext(filename)
        for i in range(top_conf.shape[0]):
            label = int(top_label_indices[i])
            label_name = self.voc_classes[label - 1]
            print('%s: %.8f' % (label_name, top_conf[i]))
            xmin = int(round(top_xmin[i] * img_orig_array.shape[1]))
            ymin = int(round(top_ymin[i] * img_orig_array.shape[0]))
            xmax = int(round(top_xmax[i] * img_orig_array.shape[1]))
            ymax = int(round(top_ymax[i] * img_orig_array.shape[0]))

            # Bucket the crops by confidence decile, per class
            acc = top_conf[i] * 100 // 10 * 10
            dir_name = '%s/%s/%02d_%02d' % (save_dirpath, label_name,
                                            acc, acc + 10)
            os.makedirs(dir_name, exist_ok=True)

            target_img_array = img_orig_array[ymin:ymax, xmin:xmax]
            with array_to_img(target_img_array) as target_img:
                target_img.save('%s/%s_%.8f.jpg'
                                % (dir_name, fname, top_conf[i]))

            target_img_array_normalized = \
                img_orig_array_normalized[ymin:ymax, xmin:xmax]
            with array_to_img(
                    target_img_array_normalized) as target_img_normalized:
                target_img_normalized.save(
                    '%s/%s_%.8f_normalized.jpg'
                    % (dir_name, fname, top_conf[i]))

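# Usage sketch for UseSSD; the image path and output directory are
# placeholders. Crops land in per-class confidence-bucket subdirectories.
ssd = UseSSD()
ssd.process_img('sample.jpg', confidence=0.6, save_dirpath='./crops')
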
def create_files_for_evaluation(args, n_images=200):
    NUM_CLASSES = 21
    THRESHOLD = 0.6
    CLASSES = ['Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle', 'Bus',
               'Car', 'Cat', 'Chair', 'Cow', 'Diningtable', 'Dog', 'Horse',
               'Motorbike', 'Person', 'Pottedplant', 'Sheep', 'Sofa',
               'Train', 'Tvmonitor']

    with open(args.path_to_settings, 'r') as fp:
        sets = yaml.safe_load(fp)

    input_shape = (sets['img_height'], sets['img_width'], 3)
    priors = pickle.load(
        open(os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'priorFiles/prior_boxes_ssd300MobileNetV2_224_224.pkl'), 'rb'))

    np.set_printoptions(suppress=True)

    bbox_util = BBoxUtility(NUM_CLASSES, priors)
    config = tf.compat.v1.ConfigProto()

    inputs = []
    images = []
    result_detections = []
    result_images = []
    annotation_files = []
    print('Prepare : {} files for evaluation. '.format(n_images))
    with open(os.path.join(sets['dataset_dir'],
                           'VOC2007/ImageSets/Main/test.txt'),
              'r') as annot_f:
        for annotation in tqdm(list(annot_f)[:n_images]):
            try:
                img_path = os.path.join(
                    sets['dataset_dir'], 'VOC2007/JPEGImages/'
                ) + annotation.split(' ')[0].strip() + '.jpg'
                img = image.load_img(img_path,
                                     target_size=(input_shape[0],
                                                  input_shape[1]))
                img = image.img_to_array(img)
                result_images.append(img_path)
                images.append(img)
                inputs.append(img.copy())
                annotation_files.append(annotation)
            except Exception as e:
                print('Error while opening file.', e)

    with tf.compat.v1.Session(config=config) as s:
        tf_inference = restore_tf_checkpoint(sets, s,
                                             args.model_checkpoints)
        inputs = preprocess_input(np.array(inputs))
        img_per_batch = 5
        results = []
        start_index = 0
        print('Start computing batches')
        for end_index in tqdm(range(img_per_batch, inputs.shape[0] + 1,
                                    img_per_batch)):
            if not args.model_checkpoints:
                preds = tf_inference.predict(
                    inputs[start_index:end_index, :])
            else:
                preds = tf_inference['sess'].run(
                    fetches=tf_inference['out'],
                    feed_dict={
                        tf_inference['in']: inputs[start_index:end_index, :]
                    })
            results.extend(bbox_util.detection_out(preds))
            start_index = end_index

    for i, img in tqdm(enumerate(images)):
        # Parse the outputs.
        det_label = results[i][:, 0]
        det_conf = results[i][:, 1]
        det_xmin = results[i][:, 2]
        det_ymin = results[i][:, 3]
        det_xmax = results[i][:, 4]
        det_ymax = results[i][:, 5]

        # Get detections with confidence higher than THRESHOLD.
        top_indices = [i for i, conf in enumerate(det_conf)
                       if conf >= THRESHOLD]

        top_conf = det_conf[top_indices]
        top_label_indices = det_label[top_indices].tolist()
        top_xmin = det_xmin[top_indices]
        top_ymin = det_ymin[top_indices]
        top_xmax = det_xmax[top_indices]
        top_ymax = det_ymax[top_indices]

        detections = []
        for i in range(top_conf.shape[0]):
            # Coordinates are kept normalized; pixel coordinates would be
            # xmin = int(round(top_xmin[i] * img.shape[1])), etc.
            xmin = top_xmin[i]
            ymin = top_ymin[i]
            xmax = top_xmax[i]
            ymax = top_ymax[i]
            score = top_conf[i]
            label = int(top_label_indices[i])
            label_name = CLASSES[label - 1]
            detections.append(['{:.2f}'.format(xmin),
                               '{:.2f}'.format(ymin),
                               '{:.2f}'.format(xmax),
                               '{:.2f}'.format(ymax),
                               label_name, '{:.2f}'.format(score)])
        result_detections.append(detections)

    print('Test images: {}'.format(len(result_images)))

    model_predictions = []
    MODEL_PREDICTION_PATH = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'model_evaluation/model_prediction/')
    predicted_images = []
    for index, image_filename in tqdm(enumerate(result_images)):
        image_name = os.path.basename(image_filename)
        path_elements = image_name[:-4]
        predicted_images.append(image_name[:-4])
        annot_dir = os.path.join(MODEL_PREDICTION_PATH)
        os.makedirs(annot_dir, exist_ok=True)
        annot_name = '{}.txt'.format(path_elements)
        annot_filename = os.path.join(annot_dir, annot_name)
        with open(annot_filename, 'w') as output_f:
            for d in result_detections[index]:
                left, top, right, bottom, classe, score = (
                    d[0], d[1], d[2], d[3], d[4], d[5])
                model_predictions.append(
                    (classe, score, left, top, right, bottom))
                output_f.write('{} {} {} {} {} {}\n'.format(
                    classe, score, left, top, right, bottom))

    GROUND_TRUTH_LABELS = os.path.join(sets['dataset_dir'],
                                       'VOC2007/Annotations')
    GROUND_TRUTH_PATH = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'model_evaluation/ground_truth/')
    for f in glob(GROUND_TRUTH_PATH + '*'):
        os.remove(f)
    filenames = os.listdir(GROUND_TRUTH_LABELS)
    ground_images = []
    for filename in tqdm(filenames):
        if filename[:-4] not in predicted_images:
            continue
        ground_images.append(filename[:-4])
        tree = ElementTree.parse(
            os.path.join(GROUND_TRUTH_LABELS + '/{}'.format(filename)))
        root = tree.getroot()
        bounding_boxes = []
        one_hot_classes = []
        size_tree = root.find('size')
        width = float(size_tree.find('width').text)
        height = float(size_tree.find('height').text)
        for object_tree in root.findall('object'):
            for bounding_box in object_tree.iter('bndbox'):
                xmin = float(bounding_box.find('xmin').text) / width
                ymin = float(bounding_box.find('ymin').text) / height
                xmax = float(bounding_box.find('xmax').text) / width
                ymax = float(bounding_box.find('ymax').text) / height
            class_name = object_tree.find('name').text.title()
            bounding_box = [class_name, xmin, ymin, xmax, ymax]
            bounding_boxes.append(bounding_box)
        with open(os.path.join(GROUND_TRUTH_PATH,
                               filename.replace('xml', 'txt')),
                  'w+') as f:
            for p in bounding_boxes:
                f.write(' '.join([str(s) for s in p]) + "\n")

    print('Completed eval preparation')
    assert len(ground_images) == len(predicted_images)

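# Usage sketch for create_files_for_evaluation, assuming an argparse
# namespace with the fields the function reads; the default values here
# are placeholders.
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--path_to_settings', default='settings.yaml')
parser.add_argument('--model_checkpoints', default='')
args = parser.parse_args()
create_files_for_evaluation(args, n_images=200)
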
class SSDPrecitor(PredictorBase):
    """
    SSD-based object detector.

    Arguments:
        modelfile: path of the model file
        shape: input size (width, height, channels) fed to the SSD
               detector. Default is (300, 300, 3)
        num_classes: number of classes of the model. Default is 21
        conf_thresh: confidence threshold for detection results
    """

    def __init__(self, modelfile, shape=(300, 300, 3), num_classes=21,
                 conf_thresh=0.6):
        self.input_shape = shape
        self.num_classes = num_classes
        self.conf_thresh = conf_thresh

        # Build the model
        model = SSD(shape, num_classes=num_classes)
        model.load_weights(modelfile)
        self.model = model

        # Bounding-box utility
        self.bbox_util = BBoxUtility(self.num_classes)

    def predict(self, src):
        """
        Detect objects in the input image with SSD.

        :param src: input image
        :return: list of (label id, score, box data) tuples -- note that
                 the box values are floats!
        """
        height, width, channels = src.shape

        # Preprocessing
        x = self._preprocess(src)
        # Inference
        y = self.model.predict(x)
        # Postprocessing
        results = self._decodebox(y)

        # Output, scaled back to source-image pixels
        if results.shape[0] > 0:
            results[:, 2] = results[:, 2] * width
            results[:, 3] = results[:, 3] * height
            results[:, 4] = results[:, 4] * width
            results[:, 5] = results[:, 5] * height
            return [(int(x[0]), x[1], x[2:6]) for x in results]
        return []

    def _preprocess(self, src):
        """
        Preprocess the input image.

        :param src: input image
        :return: image resized to 300x300 and converted from BGR to RGB
        """
        im_size = (self.input_shape[0], self.input_shape[1])
        resized = cv2.resize(src, im_size)
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        inputs = [image.img_to_array(rgb)]
        return preprocess_input(np.array(inputs))

    def _decodebox(self, preds):
        """
        Postprocessing after inference.

        :param preds: raw SSD model output
        :return: only the boxes whose conf exceeds the threshold

        detection_out yields, per box:
            0: label
            1: conf
            2-5: bbox (xmin, ymin, xmax, ymax)
        """
        box_results = self.bbox_util.detection_out(preds)

        result = np.array([])
        if len(box_results) > 0 and len(box_results[0]) > 0:
            box_result = box_results[0]
            # Indices of boxes whose score exceeds the threshold
            top_indices = np.where(box_result[:, 1] > self.conf_thresh)[0]
            result = box_result[top_indices]
            return result
        else:
            return result

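# Usage sketch for SSDPrecitor, assuming PredictorBase and the SSD model
# factory used above; the weight file and image path are placeholders.
predictor = SSDPrecitor('weights_SSD300.hdf5')
frame = cv2.imread('street.jpg')
for label_id, score, box in predictor.predict(frame):
    print(label_id, score, box)  # box is (xmin, ymin, xmax, ymax) in pixels
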
def test_on_video(model, name, experiment, videopath, outvideopath, classnames, batch_size=32, input_shape=(480, 640, 3), soft=False, width=480, height=640, conf_thresh=0.75, csv_conf_thresh=0.75): """ Applies a trained SSD model to a video Arguments: model -- the SSD model, e.g. from get_model name -- name of dataset experiment -- name of training run videopath -- path to input video outvideopath -- path to output video showing the detections classnames -- list of all the classes batch_size -- number of images processed in parallell, lower this if you get out-of-memory errors input_shape -- size of images fed to SSD soft -- Whether to do soft NMS or normal NMS width -- Width to scale detections with (can be set to 1 if detections are already on right scale) height -- Height to scale detections with (can be set to 1 if detections are already on right scale) conf_thresh -- Detections with confidences below this are not shown in output video. Set to negative to not visualize confidences. csv_conf_thresh -- Detections with confidences below this are ignored. This should be same as conf_thresh unless conf_thresh is negative. """ masker = Masker(name) num_classes = len(classnames) + 1 colors = class_colors(num_classes) make_vid = True suffix = outvideopath.split('.')[-1] if suffix == 'csv': make_vid = False csvpath = outvideopath else: csvpath = outvideopath.replace('.{}'.format(suffix), '.csv') print_flush('Generating priors') im_in = np.random.random( (1, input_shape[1], input_shape[0], input_shape[2])) priors = model.predict(im_in, batch_size=1)[0, :, -8:] bbox_util = BBoxUtility(num_classes, priors) vid = io.get_reader(videopath) if make_vid: outvid = io.get_writer(outvideopath, fps=30) inputs = [] frames = [] all_detections = [] for i, frame in enumerate(vid): frame = masker.mask(frame) resized = cv2.resize(frame, (input_shape[0], input_shape[1])) frames.append(frame.copy()) inputs.append(resized) if len(inputs) == batch_size: inputs = np.array(inputs).astype(np.float64) inputs = preprocess_input(inputs) preds = model.predict(inputs, batch_size=batch_size, verbose=0) results = bbox_util.detection_out(preds, soft=soft) for result, frame, frame_number in zip(results, frames, range(i - batch_size, i)): result = [ r if len(r) > 0 else np.zeros((1, 6)) for r in result ] raw_detections = pd.DataFrame(np.vstack(result), columns=[ 'class_index', 'confidence', 'xmin', 'ymin', 'xmax', 'ymax' ]) rescale(raw_detections, 'xmin', width) rescale(raw_detections, 'xmax', width) rescale(raw_detections, 'ymin', height) rescale(raw_detections, 'ymax', height) rescale(raw_detections, 'class_index', 1) ci = raw_detections['class_index'] cn = [classnames[int(x) - 1] for x in ci] raw_detections['class_name'] = cn raw_detections['frame_number'] = (frame_number + 2) all_detections.append(raw_detections[ raw_detections.confidence > csv_conf_thresh]) if make_vid: frame = draw(frame, raw_detections, colors, conf_thresh=conf_thresh) outvid.append_data(frame) frames = [] inputs = [] if i % (10 * batch_size) == 0: print_flush(i) detections = pd.concat(all_detections) detections.to_csv(csvpath)
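# --- Hedged sketch of the `rescale` helper that test_on_video calls but this
# snippet never defines. Judging from the call sites (including the
# near-no-op rescale(..., 'class_index', 1)), it plausibly scales one
# DataFrame column in place and rounds it to integers; treat this as an
# assumption, not the original implementation.
def rescale(df, key, factor):
    """Multiply detection column `key` by `factor` in place, e.g. to map
    relative [0, 1] coordinates to pixels, then round to whole numbers."""
    df[key] = (df[key] * factor).round().astype(int)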
class TLClassifier(object):
    def __init__(self):
        NUM_CLASSES = 3 + 1
        input_shape = (300, 300, 3)
        #config_string = rospy.get_param("/traffic_light_config")
        #self.config = yaml.load(config_string)
        #self.stop_line_positions = self.config['stop_line_positions']
        # get path to resources
        #path_to_resources = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', '..', '..', 'tlc')
        # "prior boxes" in the paper
        #priors = pickle.load(open(os.path.join(path_to_resources, 'prior_boxes_ssd300.pkl'), 'rb'))
        priors = pickle.load(open('prior_boxes_ssd300.pkl', 'rb'))
        self.bbox_util = BBoxUtility(NUM_CLASSES, priors)
        # Traffic Light Classifier model and its weights
        self.model = SSD300(input_shape, num_classes=NUM_CLASSES)
        #self.model.load_weights(os.path.join(path_to_resources, self.config['classifier_weights_file']), by_name=True)
        #self.model.load_weights('weights.180314.hdf5', by_name=True)
        self.model.load_weights('checkpoints/weights.07-0.70.hdf5',
                                by_name=True)
        # Warm-up prediction so TensorFlow does not raise a ValueError about
        # an uninitialized backend on the first real request
        dummy = np.zeros((1, 300, 300, 3))
        _ = self.model.predict(dummy, batch_size=1, verbose=0)
        self.is_in_progress = False
        self.last_result = TrafficLight.UNKNOWN

    def get_classification(self, img):
        """Determines the color of the traffic light in the image

        Args:
            img (cv::Mat): image containing the traffic light, assumed to be
                a 3D numpy.array (800, 600, 3) with bgr8: CV_8UC3 color image

        Returns:
            int: ID of traffic light color (specified in styx_msgs/TrafficLight)
        """
        #if self.is_in_progress:
        #    return self.last_result, 0, 0, 0, 0, 0
        #self.is_in_progress = True
        # resize the img arg to the input size the model expects
        pilImg = Image.fromarray(np.uint8(img)).resize((300, 300))
        img = np.array(pilImg)
        img = image.img_to_array(img)
        inputs = np.reshape(img, (1, 300, 300, 3))  # 'inputs' expects this size
        # prediction
        inputs = preprocess_input(np.array(inputs))
        preds = self.model.predict(inputs, batch_size=1, verbose=0)
        results = self.bbox_util.detection_out(preds)
        if results is None or len(results) == 0 or len(results[0]) == 0:
            self.last_result = TrafficLight.UNKNOWN
            self.is_in_progress = False
            return self.last_result, 0, 0, 0, 0, 0
        det_label = results[0][:, 0]
        det_conf = results[0][:, 1]
        det_xmin = results[0][:, 2]
        det_ymin = results[0][:, 3]
        det_xmax = results[0][:, 4]
        det_ymax = results[0][:, 5]
        # Get detections
        top_indices = [j for j, conf in enumerate(det_conf) if conf >= 0.6]
        top_label_indices = det_label[top_indices].tolist()
        if top_label_indices == []:
            self.is_in_progress = False
            return TrafficLight.UNKNOWN, 0, 0, 0, 0, 0
        top_conf = det_conf[top_indices]
        top_xmin = det_xmin[top_indices][0]
        top_ymin = det_ymin[top_indices][0]
        top_xmax = det_xmax[top_indices][0]
        top_ymax = det_ymax[top_indices][0]
        score = top_conf[0]
        # return the first signal detected
        label = int(top_label_indices[0])
        #print "Found label " + str(label) + " at " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.is_in_progress = False
        if label == 0:
            return TrafficLight.UNKNOWN, 0, 0, 0, 0, 0
        elif label == 1:
            return TrafficLight.RED, score, top_xmin, top_ymin, top_xmax, top_ymax
        elif label == 2:
            return TrafficLight.YELLOW, score, top_xmin, top_ymin, top_xmax, top_ymax
        elif label == 3:
            return TrafficLight.GREEN, score, top_xmin, top_ymin, top_xmax, top_ymax
        else:
            return TrafficLight.UNKNOWN, score, top_xmin, top_ymin, top_xmax, top_ymax
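# --- Usage sketch (not in the original source): exercising the classifier
# outside ROS. The image path is a placeholder, and the priors/weights files
# referenced in __init__ are assumed to exist locally.
import cv2

classifier = TLClassifier()
bgr = cv2.imread('traffic_light.jpg')
state, score, xmin, ymin, xmax, ymax = classifier.get_classification(bgr)
print('state={} score={:.2f} box=({:.2f}, {:.2f}, {:.2f}, {:.2f})'.format(
    state, score, xmin, ymin, xmax, ymax))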
def main(dataset, run, input_shape, seq_start, seq_stop, videopath, conf_thresh, i_seq, outname, batch_size): print_flush("> Predicting...") classes = get_classnames(dataset) masker = Masker(dataset) input_shape = parse_resolution(input_shape) num_classes = len(classes) + 1 model = get_model(dataset, run, input_shape, num_classes, verbose=False) priors = get_priors(model, input_shape) bbox_util = BBoxUtility(num_classes, priors) width = input_shape[0] height = input_shape[1] inputs = [] outputs = [] old_frame = None with io.get_reader(videopath) as vid: vlen = len(vid) for i_in_seq in range(seq_start, seq_stop): if i_in_seq < vlen: frame = vid.get_data(i_in_seq) frame = masker.mask(frame) old_frame = frame else: frame = old_frame resized = cv2.resize(frame, (width, height)) inputs.append(resized) if len(inputs) == batch_size: inputs2 = np.array(inputs) inputs2 = inputs2.astype(np.float32) inputs2 = preprocess_input(inputs2) y = model.predict_on_batch(inputs2) outputs.append(y) inputs = [] preds = np.vstack(outputs) print_flush("> Processing...") all_detections = [] seq_len = seq_stop - seq_start for i in range(seq_len): frame_num = i + seq_start if frame_num < vlen: pred = preds[i, :] pred = pred.reshape(1, pred.shape[0], pred.shape[1]) results = bbox_util.detection_out(pred, soft=False) detections = process_results(results, width, height, classes, conf_thresh, frame_num) all_detections.append(detections) dets = pd.concat(all_detections) # For the first line, we should open in write mode, and then in append mode # This way, we still overwrite the files if this script is run multiple times open_mode = 'a' include_header = False if i_seq == 0: open_mode = 'w' include_header = True print_flush("> Writing to {} ...".format(outname)) with open(outname, open_mode) as f: dets.to_csv(f, header=include_header)
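# --- Hedged driver sketch (not in the original source): one plausible way
# the seq_start/seq_stop/i_seq arguments compose. A long video is processed
# in fixed-size chunks so that only the first chunk (i_seq == 0) truncates
# the output CSV and writes its header. The chunk size and helper name are
# illustrative assumptions; the chunk is kept a multiple of batch_size,
# since the loop above only flushes full batches.
def predict_whole_video(dataset, run, input_shape, videopath, outname,
                        video_length, chunk_size=1024, conf_thresh=0.6,
                        batch_size=32):
    assert chunk_size % batch_size == 0
    for i_seq, seq_start in enumerate(range(0, video_length, chunk_size)):
        main(dataset, run, input_shape, seq_start, seq_start + chunk_size,
             videopath, conf_thresh, i_seq, outname, batch_size)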
class PPM: cars = 0 model = None img_path = '/tmp/ppm.jpg' bbox_util = None conf_limit = 0.6 def __init__(self, conf_limit=0.6): self.conf_limit = conf_limit np.set_printoptions(suppress=True) config = tf.ConfigProto() #config.gpu_options.per_process_gpu_memory_fraction = 0.45 set_session(tf.Session(config=config)) self.voc_classes = [ 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' ] NUM_CLASSES = len(self.voc_classes) + 1 self.bbox_util = BBoxUtility(NUM_CLASSES) input_shape = (300, 300, 3) self.model = SSD300(input_shape, num_classes=NUM_CLASSES) self.model.load_weights('weights_SSD300.hdf5', by_name=True) def read_cars(self): inputs = [] images = [] img = image.load_img(self.img_path, target_size=(300, 300)) img = image.img_to_array(img) images.append(imread(self.img_path)) inputs.append(img.copy()) inputs = preprocess_input(np.array(inputs)) preds = self.model.predict(inputs, batch_size=1, verbose=0) results = self.bbox_util.detection_out(preds) if results == None or len(results[0]) == 0: return 0 i = 0 img = images[0] # Parse the outputs. det_label = results[i][:, 0] det_conf = results[i][:, 1] det_xmin = results[i][:, 2] det_ymin = results[i][:, 3] det_xmax = results[i][:, 4] det_ymax = results[i][:, 5] # Get detections with confidence higher than conf_limit. top_indices = [ i for i, conf in enumerate(det_conf) if conf >= self.conf_limit ] top_conf = det_conf[top_indices] top_label_indices = det_label[top_indices].tolist() top_xmin = det_xmin[top_indices] top_ymin = det_ymin[top_indices] top_xmax = det_xmax[top_indices] top_ymax = det_ymax[top_indices] cars = 0 for i in range(top_conf.shape[0]): xmin = int(round(top_xmin[i] * img.shape[1])) ymin = int(round(top_ymin[i] * img.shape[0])) xmax = int(round(top_xmax[i] * img.shape[1])) ymax = int(round(top_ymax[i] * img.shape[0])) score = top_conf[i] label = int(top_label_indices[i]) label_name = self.voc_classes[label - 1] print('Label: ' + label_name) if label_name == 'car': cars += 1 display_txt = '{:0.2f}, {}'.format(score, label_name) coords = (xmin, ymin), xmax - xmin + 1, ymax - ymin + 1 return cars
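# --- Usage sketch (not in the original source): counting cars in whatever
# snapshot the surrounding pipeline last wrote to /tmp/ppm.jpg; that producer
# is an assumption about the rest of the system.
ppm = PPM(conf_limit=0.6)
print('cars detected: {}'.format(ppm.read_cars()))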
class TLClassifier(object):
    def __init__(self):
        NUM_CLASSES = 3 + 1
        input_shape = (300, 300, 3)
        config_string = rospy.get_param("/traffic_light_config")
        self.config = yaml.load(config_string)
        self.stop_line_positions = self.config['stop_line_positions']
        # get path to resources
        path_to_resources = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            '..', '..', '..', '..', 'tlc')
        # "prior boxes" in the paper
        priors = pickle.load(
            open(os.path.join(path_to_resources, 'prior_boxes_ssd300.pkl'),
                 'rb'))
        self.bbox_util = BBoxUtility(NUM_CLASSES, priors)
        # Traffic Light Classifier model and its weights
        self.model = SSD300(input_shape, num_classes=NUM_CLASSES)
        print(self.model.summary())
        self.model.load_weights(os.path.join(
            path_to_resources, self.config['classifier_weights_file']),
            by_name=True)
        # Warm-up prediction so TensorFlow does not raise a ValueError about
        # an uninitialized backend on the first real request
        dummy = np.zeros((1, 300, 300, 3))
        _ = self.model.predict(dummy, batch_size=1, verbose=0)
        self.capture_images = False
        self.image_counts = {0: 0, 1: 0, 2: 0, 4: 0}
        self.last_classification = None

    def get_classification(self, imgInput, light_state):
        """Determines the color of the traffic light in the image

        Args:
            imgInput (cv::Mat): image containing the traffic light, assumed
                to be a 3D numpy.array (800, 600, 3) with bgr8: CV_8UC3
                color image

        Returns:
            int: ID of traffic light color (specified in styx_msgs/TrafficLight)
        """
        # resize the image arg to the input size the model expects
        pilImg = Image.fromarray(np.uint8(imgInput)).resize((300, 300))
        img = np.array(pilImg)
        img = image.img_to_array(img)
        inputs = np.reshape(img, (1, 300, 300, 3))  # 'inputs' expects this size
        # prediction
        inputs = preprocess_input(np.array(inputs))
        preds = self.model.predict(inputs, batch_size=1, verbose=0)
        results = self.bbox_util.detection_out(preds)
        if results is None or len(results) == 0 or len(results[0]) == 0:
            if self.last_classification is not None:
                if (datetime.datetime.utcnow() -
                        self.last_classification[1]).total_seconds() > 3.5:
                    self.last_classification = None
                else:
                    return self.last_classification[0]
            self.save_image(imgInput, light_state)
            return TrafficLight.UNKNOWN
        det_label = results[0][:, 0]
        det_conf = results[0][:, 1]
        # Get detections with confidence >= 0.8
        top_indices = [j for j, conf in enumerate(det_conf) if conf >= 0.8]
        top_label_indices = det_label[top_indices].tolist()
        # return the first signal detected
        if top_label_indices == []:
            if self.last_classification is not None:
                if (datetime.datetime.utcnow() -
                        self.last_classification[1]).total_seconds() > 3.5:
                    self.last_classification = None
                else:
                    return self.last_classification[0]
            self.save_image(imgInput, light_state)
            return TrafficLight.UNKNOWN
        label = int(top_label_indices[0])
        #print "Found label " + str(label) + " at " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if label == 1:
            self.last_classification = (TrafficLight.RED,
                                        datetime.datetime.utcnow())
        elif label == 2:
            self.last_classification = (TrafficLight.YELLOW,
                                        datetime.datetime.utcnow())
        elif label == 3:
            self.last_classification = (TrafficLight.GREEN,
                                        datetime.datetime.utcnow())
        else:
            if self.last_classification is not None:
                if (datetime.datetime.utcnow() -
                        self.last_classification[1]).total_seconds() > 3.5:
                    self.last_classification = None
                else:
                    return self.last_classification[0]
            return TrafficLight.UNKNOWN
        return self.last_classification[0]

    def save_image(self, image, state):
        if self.capture_images and self.image_counts[state] < 100:
            # Save to disk
            path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "capture", str(state))
            if not os.path.isdir(path):
                os.makedirs(path)
            name = str(time.time()) + '.png'
            cv2.imwrite(os.path.join(path, name), image)
            self.image_counts[state] += 1
def run_camera(input_shape, model, video_path, image_path_ori, image_path_crop): num_classes = 21 conf_thresh = 0.5 input_shape = input_shape bbox_util = BBoxUtility(num_classes) class_colors = [] for i in range(0, num_classes): hue = 255 * i / num_classes col = np.zeros((1, 1, 3)).astype("uint8") col[0][0][0] = hue col[0][0][1] = 128 # Saturation col[0][0][2] = 255 # Value cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR) col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2])) class_colors.append(col) class_list = os.listdir(video_path) for action in class_list: all_action = os.listdir(video_path + action) for sample in all_action: print(video_path + action + '/' + sample) name = sample.split('.')[0] if not os.path.exists(image_path_ori + action + '/' + name): os.mkdir(image_path_ori + action + '/' + name) if not os.path.exists(image_path_crop + action + '/' + name): os.mkdir(image_path_crop + action + '/' + name) vid = cv2.VideoCapture(video_path + action + '/' + sample) # Compute aspect ratio of video vidw = vid.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH) vidh = vid.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT) frame_length = vid.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT) # vidar = vidw / vidh frame_count = 0 for n in range(int(frame_length)): retval, orig_image = vid.read() if not retval: print("Done!") return im_size = (input_shape[0], input_shape[1]) resized = cv2.resize(orig_image, im_size) rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) inputs = [image.img_to_array(rgb)] tmp_inp = np.array(inputs) x = preprocess_input(tmp_inp) y = model.predict(x) results = bbox_util.detection_out(y) if len(results) > 0 and len(results[0]) > 0: det_label = results[0][:, 0] det_conf = results[0][:, 1] det_xmin = results[0][:, 2] det_ymin = results[0][:, 3] det_xmax = results[0][:, 4] det_ymax = results[0][:, 5] top_indices = [ i for i, conf in enumerate(det_conf) if conf >= conf_thresh ] top_conf = det_conf[top_indices] top_label_indices = det_label[top_indices].tolist() top_xmin = det_xmin[top_indices] top_ymin = det_ymin[top_indices] top_xmax = det_xmax[top_indices] top_ymax = det_ymax[top_indices] if 15 not in top_label_indices: pass else: for i in range(top_conf.shape[0]): xmin = int(round((top_xmin[i] * vidw) * 0.9)) ymin = int(round((top_ymin[i] * vidh) * 0.9)) xmax = int(round( (top_xmax[i] * vidw) * 1.1)) if int(round( (top_xmax[i] * vidw) * 1.1)) <= vidw else int( round(top_xmax[i] * vidw)) ymax = int(round( (top_ymax[i] * vidh) * 1.1)) if int(round( (top_ymax[i] * vidh) * 1.1)) <= vidh else int( round(top_ymax[i] * vidh)) # save frames class_num = int(top_label_indices[i]) if class_num == 15: frame = cv2.cvtColor(orig_image, cv2.COLOR_BGR2GRAY) cv2.imwrite( image_path_ori + action + '/' + name + str(10000 + frame_count) + '.jpg', frame) cropImage = frame[ymin:ymax, xmin:xmax] cropImage = cv2.resize(cropImage, (64, 64)) cv2.imwrite( image_path_crop + action + '/' + name + str(10000 + frame_count) + '.jpg', cropImage) frame_count += 1
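# --- Usage sketch (not in the original source): building the SSD model and
# pointing the frame extractor above at a directory tree of action videos.
# The paths are placeholders; note the function uses the old
# cv2.cv.CV_CAP_PROP_* constants, so it assumes an OpenCV 2.x environment.
input_shape = (300, 300, 3)
model = SSD300(input_shape, num_classes=21)
model.load_weights('weights_SSD300.hdf5', by_name=True)
run_camera(input_shape, model,
           video_path='videos/',
           image_path_ori='frames_full/',
           image_path_crop='frames_crop/')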
def objct_recognition(self, img_paths):
    """Object detection with SSD."""
    # --- general settings ---
    self.img_paths = img_paths
    plt.rcParams['figure.figsize'] = (8, 8)  # figure size in inches
    # http://d.hatena.ne.jp/nishiohirokazu/20111121/1321849806
    plt.rcParams['image.interpolation'] = 'nearest'  # interpolation algorithm
    np.set_printoptions(suppress=True)  # suppress scientific notation
    config = tf.ConfigProto()
    # fraction of GPU memory each process may occupy
    config.gpu_options.per_process_gpu_memory_fraction = 0.45
    set_session(tf.Session(config=config))
    # --- recognized classes ---
    voc_classes = [
        'Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle', 'Bus', 'Car',
        'Cat', 'Chair', 'Cow', 'Diningtable', 'Dog', 'Horse', 'Motorbike',
        'Person', 'Pottedplant', 'Sheep', 'Sofa', 'Train', 'Tvmonitor'
    ]
    NUM_CLASSES = len(voc_classes) + 1
    # --- model setup ---
    input_shape = (300, 300, 3)  # input image size
    model = SSD300(input_shape, num_classes=NUM_CLASSES)  # build the model
    model.load_weights('weights_SSD300.hdf5', by_name=True)  # load weights
    bbox_util = BBoxUtility(NUM_CLASSES)  # bounding-box decoding utility
    # --- load the images ---
    inputs = []
    images = []
    for img_path in self.img_paths:
        img = image.load_img(img_path, target_size=(300, 300))  # load image
        img = image.img_to_array(img)  # cast to an array
        images.append(imread(img_path))
        inputs.append(img.copy())
    inputs = preprocess_input(np.array(inputs))  # preprocessing
    preds = model.predict(inputs, batch_size=1, verbose=1)  # recognition
    self.results = bbox_util.detection_out(preds)  # decode the raw output
    # loop over the input images
    for i_, img in enumerate(images):
        # update shapes
        self.shapes.append([img.shape[1], img.shape[0]])
        # skip this image if nothing was detected
        if self.results[i_] == []:
            continue
        det_label = self.results[i_][:, 0]
        det_conf = self.results[i_][:, 1]
        det_xmin = self.results[i_][:, 2]
        det_ymin = self.results[i_][:, 3]
        det_xmax = self.results[i_][:, 4]
        det_ymax = self.results[i_][:, 5]
        # keep detections with confidence >= 0.6
        top_indices = [j for j, conf in enumerate(det_conf) if conf >= 0.6]
        top_conf = det_conf[top_indices]
        top_label_indices = det_label[top_indices].tolist()
        top_xmin = det_xmin[top_indices]
        top_ymin = det_ymin[top_indices]
        top_xmax = det_xmax[top_indices]
        top_ymax = det_ymax[top_indices]
        colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()
        plt.imshow(img / 255.)  # draw the image
        currentAxis = plt.gca()
        # draw each detection
        for i in range(top_conf.shape[0]):
            xmin = int(round(top_xmin[i] * img.shape[1]))
            ymin = int(round(top_ymin[i] * img.shape[0]))
            xmax = int(round(top_xmax[i] * img.shape[1]))
            ymax = int(round(top_ymax[i] * img.shape[0]))
            score = top_conf[i]
            label = int(top_label_indices[i])
            label_name = voc_classes[label - 1]
            display_txt = '{:0.2f}, {}'.format(score, label_name)
            coords = (xmin, ymin), xmax - xmin + 1, ymax - ymin + 1
            color = colors[label]
            currentAxis.add_patch(
                plt.Rectangle(*coords, fill=False, edgecolor=color,
                              linewidth=2))
            currentAxis.text(xmin, ymin, display_txt,
                             bbox={'facecolor': color, 'alpha': 0.5})
            # print(xmin, ymin, xmax, ymax)
        # save the figure
        #plt.savefig("./data/recognition_imgs/"+os.path.basename(img_paths[i_]))
        plt.savefig(os.path.basename(img_paths[i_]))
        plt.close()
        import gc
        gc.collect()  # release memory
with tf.compat.v1.Session(config=config) as s: tf_inference = restore_tf_checkpoint(sets, s) inputs = preprocess_input(np.array(inputs)) img_per_batch = 5 results = [] start_index = 0 for end_index in tqdm( range(img_per_batch, inputs.shape[0] + 1, img_per_batch)): preds = tf_inference['sess'].run( fetches=tf_inference['out'], feed_dict={ tf_inference['in']: inputs[start_index:end_index, :] }) results.extend(bbox_util.detection_out(preds)) start_index = end_index for i, img in tqdm(enumerate(images)): # Parse the outputs. det_label = results[i][:, 0] det_conf = results[i][:, 1] det_xmin = results[i][:, 2] det_ymin = results[i][:, 3] det_xmax = results[i][:, 4] det_ymax = results[i][:, 5] # Get detections with confidence higher than 0.6. top_indices = [ i for i, conf in enumerate(det_conf) if conf >= THRESHOLD ]
def predict(model, img): inputs = [] plt.cla() img = image.img_to_array(img) img = np.asarray(img) inputs.append(img.copy()) inputs = np.asarray(inputs) inputs = preprocess_input(inputs) preds = model.predict(inputs, batch_size=1, verbose=1) bbox_util = BBoxUtility(NUM_CLASSES) results = bbox_util.detection_out(preds) # Parse the outputs. det_label = results[0][:, 0] det_conf = results[0][:, 1] det_xmin = results[0][:, 2] det_ymin = results[0][:, 3] det_xmax = results[0][:, 4] det_ymax = results[0][:, 5] top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.6] #0.6 top_conf = det_conf[top_indices] top_label_indices = det_label[top_indices].tolist() top_xmin = det_xmin[top_indices] top_ymin = det_ymin[top_indices] top_xmax = det_xmax[top_indices] top_ymax = det_ymax[top_indices] colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() plt.imshow(img / 255.) currentAxis = plt.gca() money_total = 0 money_num_list = [10, 100, 5] for i in range(top_conf.shape[0]): xmin = int(round(top_xmin[i] * img.shape[1])) ymin = int(round(top_ymin[i] * img.shape[0])) xmax = int(round(top_xmax[i] * img.shape[1])) ymax = int(round(top_ymax[i] * img.shape[0])) score = top_conf[i] label = int(top_label_indices[i]) label_name = voc_classes[label - 1] display_txt = '{:0.2f}, {}'.format(score, label_name) coords = (xmin, ymin), xmax - xmin + 1, ymax - ymin + 1 color = colors[label] currentAxis.add_patch( plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2)) currentAxis.text(xmin, ymin, display_txt, bbox={ 'facecolor': color, 'alpha': 0.5 }) money_total = money_total + money_num_list[label - 1] plt.title(f'Total:{money_total} yen') canvas = FigureCanvasAgg(currentAxis.figure) buf = io.BytesIO() plt.savefig(buf) buf.seek(0) return buf
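# --- Usage sketch (assumptions: the SSD300 weights file and the global
# NUM_CLASSES/voc_classes/money_num_list setup match the neighbouring
# snippets; file names are placeholders). predict() hands back an in-memory
# PNG of the annotated figure, so it can be written to disk or served from a
# web handler.
from keras.preprocessing import image

model = SSD300((300, 300, 3), num_classes=NUM_CLASSES)
model.load_weights('weights_SSD300.hdf5', by_name=True)
img = image.load_img('coins.jpg', target_size=(300, 300))
buf = predict(model, img)
with open('coins_detected.png', 'wb') as f:
    f.write(buf.read())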
def run_camera(input_shape, model):
    num_classes = 21
    conf_thresh = 0.5
    bbox_util = BBoxUtility(num_classes)
    vid = cv2.VideoCapture(0)
    sleep(1.0)
    # Compute aspect ratio of video
    vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
    vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
    trackers = Tracker()
    while True:
        ret, origin_image = vid.read()
        frame = origin_image
        if not ret:
            print("Done!")
            return None
        im_size = (input_shape[0], input_shape[1])
        resized = cv2.resize(frame, im_size)
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        inputs = [image.img_to_array(rgb)]
        tmp_inp = np.array(inputs)
        x = preprocess_input(tmp_inp)
        y = model.predict(x)
        results = bbox_util.detection_out(y)
        if len(results) > 0 and len(results[0]) > 0:
            det_label = results[0][:, 0]
            det_conf = results[0][:, 1]
            det_xmin = results[0][:, 2]
            det_ymin = results[0][:, 3]
            det_xmax = results[0][:, 4]
            det_ymax = results[0][:, 5]
            top_indices = [i for i, conf in enumerate(det_conf)
                           if conf >= conf_thresh]
            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]
            if 15 not in top_label_indices:
                pass
            else:
                trackers.bbox = []
                trackers.features_current = []
                trackers.index = []
                for i in range(top_conf.shape[0]):
                    class_num = int(top_label_indices[i])
                    if class_num == 15:  # 'person' in the VOC class list
                        xmin = int(round((top_xmin[i] * vidw) * 0.9))
                        ymin = int(round((top_ymin[i] * vidh) * 0.9))
                        xmax = int(round((top_xmax[i] * vidw) * 1.1)) if int(
                            round((top_xmax[i] * vidw) * 1.1)) <= vidw else int(
                            round(top_xmax[i] * vidw))
                        ymax = int(round((top_ymax[i] * vidh) * 1.1)) if int(
                            round((top_ymax[i] * vidh) * 1.1)) <= vidh else int(
                            round(top_ymax[i] * vidh))
                        trackers.bbox.append([xmin, ymin, xmax, ymax])
                        trackers.features_current.append(
                            Extract_feature(cv2.resize(
                                frame[ymin:ymax, xmin:xmax, :], (32, 32))))
                if trackers.features_previous is None:
                    trackers.index = list(range(len(trackers.bbox)))
                    for j in range(len(trackers.features_current)):
                        cv2.rectangle(frame,
                                      (int(trackers.bbox[j][0]),
                                       int(trackers.bbox[j][1])),
                                      (int(trackers.bbox[j][2]),
                                       int(trackers.bbox[j][3])),
                                      (255, 0, 0), 2)
                        cv2.putText(frame,
                                    "person: {}".format(trackers.index[j] + 1),
                                    (trackers.bbox[j][0] + 10,
                                     trackers.bbox[j][1] + 10),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                                    (0, 0, 255), 1)
                else:
                    trackers.match()
                    trackers.update()
                    for j in range(len(trackers.features_current)):
                        cv2.rectangle(frame,
                                      (int(trackers.bbox[j][0]),
                                       int(trackers.bbox[j][1])),
                                      (int(trackers.bbox[j][2]),
                                       int(trackers.bbox[j][3])),
                                      (255, 0, 0), 2)
                        cv2.putText(frame,
                                    "person: {}".format(trackers.index[j] + 1),
                                    (trackers.bbox[j][0] + 10,
                                     trackers.bbox[j][1] + 10),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                                    (0, 0, 255), 1)
        cv2.imshow('tracking', frame)
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break
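# --- Hedged sketch of the Extract_feature helper the tracker above relies on
# but this snippet never defines. A simple choice consistent with how it is
# used (matching 32x32 person crops between frames) is a normalized color
# histogram; the real project may well use something else.
import cv2
import numpy as np

def Extract_feature(patch_bgr):
    """Return a flattened, L2-normalized 8x8x8 BGR color histogram."""
    hist = cv2.calcHist([patch_bgr], [0, 1, 2], None, [8, 8, 8],
                        [0, 256, 0, 256, 0, 256])
    return cv2.normalize(hist, hist).flatten()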
img_path = './pics/car_cat2.jpg' img = image.load_img(img_path, target_size=(300, 300)) img = image.img_to_array(img) images.append(imread(img_path)) inputs.append(img.copy()) inputs = preprocess_input(np.array(inputs)) # In[5]: preds = model.predict(inputs, batch_size=1, verbose=1) # In[6]: results = bbox_util.detection_out(preds) # In[8]: for i, img in enumerate(images): # Parse the outputs. det_label = results[i][:, 0] det_conf = results[i][:, 1] det_xmin = results[i][:, 2] det_ymin = results[i][:, 3] det_xmax = results[i][:, 4] det_ymax = results[i][:, 5] # Get detections with confidence higher than 0.6. top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.6]
class SSDPipeline(object):
    def __init__(self):
        voc_classes = ['Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle',
                       'Bus', 'Car', 'Cat', 'Chair', 'Cow', 'Diningtable',
                       'Dog', 'Horse', 'Motorbike', 'Person', 'Pottedplant',
                       'Sheep', 'Sofa', 'Train', 'Tvmonitor']
        NUM_CLASSES = len(voc_classes) + 1
        input_shape = (300, 300, 3)
        self.model = SSD300(input_shape, num_classes=NUM_CLASSES)
        weights_file = "./checkpoints/weights.10-2.85.hdf5"
        #weights_file = "./checkpoints/weights.39-1.61_ubuntu.hdf5"
        self.model.load_weights(weights_file, by_name=True)
        self.bbox_util = BBoxUtility(NUM_CLASSES)

    def loadImage(self, video_path):
        vid = cv2.VideoCapture(video_path)
        vidw = vid.get(3)  # CV_CAP_PROP_FRAME_WIDTH
        vidh = vid.get(4)  # CV_CAP_PROP_FRAME_HEIGHT
        print(vidw, vidh)
        input_shape = (300, 300, 3)
        vidar = vidw / vidh
        #print(vidar)
        return vidar

    def setClassColors(self):
        self.class_colors = []
        self.class_names = ["background", "Prescription", "None", "title",
                            "boat", "bottle", "bus", "car", "cat", "chair",
                            "cow", "diningtable", "dog", "horse", "motorbike",
                            "person", "pottedplant", "sheep", "sofa", "train",
                            "tvmonitor"]
        NUM_CLASSES = len(self.class_names)
        for i in range(0, NUM_CLASSES):
            # This can probably be written in a more elegant manner
            hue = 255 * i / NUM_CLASSES
            col = np.zeros((1, 1, 3)).astype("uint8")
            col[0][0][0] = hue
            col[0][0][1] = 128  # Saturation
            col[0][0][2] = 255  # Value
            cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
            col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]),
                   int(cvcol[0][0][2]))
            self.class_colors.append(col)

    def pipeline(self, orig_image):
        start_frame = 0
        # This is a manual adjustment parameter. For binary classification,
        # set a threshold higher than 0.5.
        conf_thresh = 0.50
        accum_time = 0
        curr_fps = 0
        fps = "FPS: ??"
        prev_time = timer()
        vidh, vidw, _ = orig_image.shape
        vidar = vidw / vidh
        input_shape = (300, 300, 3)
        display_shape = (600, 600, 3)
        im_size = (input_shape[0], input_shape[1])
        resized = cv2.resize(orig_image, im_size)
        to_draw = cv2.resize(resized,
                             (int(input_shape[0] * vidar), input_shape[1]))
        #to_draw = cv2.resize(resized, (int(display_shape[0]*vidar), display_shape[1]))
        #to_draw = orig_image.copy()
        # Use model to predict
        inputs = [image.img_to_array(resized)]
        tmp_inp = np.array(inputs)
        x = preprocess_input(tmp_inp)
        y = self.model.predict(x)
        #preds = model.predict(inputs, batch_size=1, verbose=1)
        results = self.bbox_util.detection_out(y)
        classes = []
        probs = []
        if len(results) > 0 and len(results[0]) > 0:
            # Interpret output, only one frame is used
            det_label = results[0][:, 0]
            det_conf = results[0][:, 1]
            det_xmin = results[0][:, 2]
            det_ymin = results[0][:, 3]
            det_xmax = results[0][:, 4]
            det_ymax = results[0][:, 5]
            top_indices = [i for i, conf in enumerate(det_conf)
                           if conf >= conf_thresh]
            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]
            for i in range(top_conf.shape[0]):
                xmin = int(round(top_xmin[i] * to_draw.shape[1]))
                ymin = int(round(top_ymin[i] * to_draw.shape[0]))
                xmax = int(round(top_xmax[i] * to_draw.shape[1]))
                ymax = int(round(top_ymax[i] * to_draw.shape[0]))
                # Draw the box on top of the to_draw image
                class_num = int(top_label_indices[i])
                # skip boxes wider than half of the screen instead of
                # drawing them
                if ( abs(xmax-xmin) > to_draw.shape[1] / 2.
): continue classes.append(self.class_names[class_num]) probs.append(top_conf[i]) cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax), self.class_colors[class_num], 2) text = self.class_names[class_num] + " " + ('%.2f' % top_conf[i]) text_top = (xmin, ymin-10) text_bot = (xmin + 80, ymin + 5) text_pos = (xmin + 5, ymin) cv2.rectangle(to_draw, text_top, text_bot, self.class_colors[class_num], -1) cv2.putText(to_draw, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,0,0), 1) # Calculate FPS # This computes FPS for everything, not just the model's execution # which may or may not be what you want #curr_time = timer() #exec_time = curr_time - prev_time #prev_time = curr_time #accum_time = accum_time + exec_time #curr_fps = curr_fps + 1 #if accum_time > 1: # accum_time = accum_time - 1 # fps = "FPS: " + str(curr_fps) # curr_fps = 0 # Draw FPS in top left corner #cv2.rectangle(to_draw, (0,0), (50, 17), (255,255,255), -1) #cv2.putText(to_draw, fps, (3,10), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,0,0), 1) #print("object NO:", i+1) #print("rectangle info: ", coords) return to_draw, classes, probs
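# --- Usage sketch (not in the original source): driving SSDPipeline frame by
# frame over a video file. The input path is a placeholder; pipeline() takes
# a BGR frame and returns the annotated frame plus parallel class and
# probability lists.
import cv2

pipe = SSDPipeline()
pipe.setClassColors()
vid = cv2.VideoCapture('input.mp4')
while True:
    ok, frame = vid.read()
    if not ok:
        break
    drawn, classes, probs = pipe.pipeline(frame)
    cv2.imshow('SSD result', drawn)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
vid.release()
cv2.destroyAllWindows()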
def frames(): video_path = 0 start_frame = 0 conf_thresh = 0.6 input_shape = (480,300,3) class_names = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"] NUM_CLASSES = len(class_names) num_classes=NUM_CLASSES class_colors = [] for i in range(0, num_classes): hue = 255*i/num_classes col = np.zeros((1,1,3)).astype("uint8") col[0][0][0] = hue col[0][0][1] = 128 # Saturation col[0][0][2] = 255 # Value cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR) col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2])) class_colors.append(col) bbox_util = BBoxUtility(num_classes) model = SSD(input_shape, num_classes=NUM_CLASSES) model.load_weights('weights_SSD300.hdf5') INTERVAL= 33 # 待ち時間 FRAME_RATE = 20 # fps ORG_WINDOW_NAME = "org" #GRAY_WINDOW_NAME = "gray" #OUT_FILE_NAME = "real_SSD_result.mp4" vid = cv2.VideoCapture(Camera.video_source) width, height = input_shape[0], input_shape[1] #input_shape """ out = cv2.VideoWriter(OUT_FILE_NAME, \ cv_fourcc('M', 'P', '4', 'V'), \ FRAME_RATE, \ (width, height), \ True) """ if not vid.isOpened(): raise IOError(("Couldn't open video file or webcam. If you're " "trying to open a webcam, make sure you video_path is an integer!")) vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH) vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT) vidar = vidw/vidh """ if start_frame > 0: vid.set(cv2.CAP_PROP_POS_MSEC, start_frame) """ accum_time = 0 curr_fps = 0 fps = "FPS: ??" prev_time = timer() start_time=prev_time #cv2.namedWindow(ORG_WINDOW_NAME) while True: retval, orig_image = vid.read() if not retval: print("Done!") return im_size = (input_shape[1], input_shape[0]) resized = cv2.resize(orig_image, im_size) rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) to_draw = cv2.resize(resized, (int(input_shape[1]*vidar), input_shape[0])) inputs = [image.img_to_array(rgb)] #rgb tmp_inp = np.array(inputs) x = preprocess_input(tmp_inp) y = model.predict(x) results = bbox_util.detection_out(y) if len(results) > 0 and len(results[0]) > 0: det_label = results[0][:, 0] det_conf = results[0][:, 1] det_xmin = results[0][:, 2] det_ymin = results[0][:, 3] det_xmax = results[0][:, 4] det_ymax = results[0][:, 5] top_indices = [i for i, conf in enumerate(det_conf) if conf >= conf_thresh] top_conf = det_conf[top_indices] top_label_indices = det_label[top_indices].tolist() top_xmin = det_xmin[top_indices] top_ymin = det_ymin[top_indices] top_xmax = det_xmax[top_indices] top_ymax = det_ymax[top_indices] for i in range(top_conf.shape[0]): xmin = int(round(top_xmin[i] * to_draw.shape[1])) ymin = int(round(top_ymin[i] * to_draw.shape[0])) xmax = int(round(top_xmax[i] * to_draw.shape[1])) ymax = int(round(top_ymax[i] * to_draw.shape[0])) class_num = int(top_label_indices[i]) cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax), class_colors[class_num], 2) #to_draw text = class_names[class_num] + " " + ('%.2f' % top_conf[i]) text_top = (xmin, ymin-10) text_bot = (xmin + 80, ymin + 5) text_pos = (xmin + 5, ymin) cv2.rectangle(to_draw, text_top, text_bot, class_colors[class_num], -1) #to_draw cv2.putText(to_draw, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,0,0), 1) #to_draw print(text," ") curr_time = timer() exec_time = curr_time - prev_time prev_time = curr_time accum_time = accum_time + exec_time curr_fps = curr_fps + 1 if accum_time > 1: accum_time = accum_time - 1 fps = "FPS: " + str(curr_fps) curr_fps = 0 cv2.rectangle(to_draw, (0,0), (50, 17), 
(255,255,255), -1) #to_draw cv2.putText(to_draw, fps, (3,10), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,0,0), 1) #to_draw #yield cv2.imencode('.jpg', to_draw)[1].tobytes() to_draw = cv2.resize(to_draw, (int(input_shape[0]*1), input_shape[1])) #cv2.imshow(ORG_WINDOW_NAME, to_draw) #to_draw #out.write(to_draw) #add to_draw if cv2.waitKey(INTERVAL)>= 0: # & 0xFF == ord('q'): break #elif curr_time-start_time>=60: # break yield cv2.imencode('.jpg', to_draw)[1].tobytes() vid.release() #add #out.release() #add cv2.destroyAllWindows() #add
class Detectors:
    def __init__(self):
        # restore the face detector and the age/gender model
        self.age_detector = load_model("transfer_Xception_29.h5")
        NUM_CLASSES = 2
        input_shape = (300, 300, 3)
        priors = pickle.load(open('prior_boxes_ssd300.pkl', 'rb'))
        self.bbox_util = BBoxUtility(NUM_CLASSES, priors)
        self.face_detector = SSD300(input_shape, num_classes=NUM_CLASSES)
        self.face_detector.load_weights('weights.05-3.15.hdf5', by_name=True)

    def age_detect(self, input):
        # a leading batch (N) dimension is needed at the front, so add it
        input_add = input
        age_predict = self.age_detector.predict(input_add)
        # map the sigmoid output (0-1) back to years (1.0 means 116 years)
        age = np.round(age_predict[0] * 116).astype(int)
        # gender
        gender = np.zeros([age_predict[1].shape[0], 1], dtype=str)
        for i in range(age_predict[1].shape[0]):
            # gender is decided like [0.2, 0.8] -> 'F', [0.6, 0.4] -> 'M'
            if 0.5 <= age_predict[1][i][0]:
                gender[i] = 'M'
            else:
                gender[i] = 'F'
        # both returned in array form
        return age, gender

    def face_detect(self, img_path, display=False):
        inputs, images, resize_imgs, bb_coordinate = [], [], [], []
        img = image.load_img(img_path, target_size=(300, 300))
        img = image.img_to_array(img)
        if '/' in img_path:
            img_original = image.load_img(img_path)
            img_original = image.img_to_array(img_original)
        else:
            # image fetched from S3
            img_original = np.array(image.load_img(img_path))
        images.append(img_original)
        inputs.append(img)
        inputs = preprocess_input(np.array(inputs))
        # predict
        preds = self.face_detector.predict(inputs, batch_size=1, verbose=0)
        results = self.bbox_util.detection_out(preds)
        for i, img in enumerate(images):
            # Parse the outputs.
            det_label = results[i][:, 0]
            det_conf = results[i][:, 1]
            det_xmin = results[i][:, 2]
            det_ymin = results[i][:, 3]
            det_xmax = results[i][:, 4]
            det_ymax = results[i][:, 5]
            top_indices = [j for j, conf in enumerate(det_conf)
                           if conf >= 0.3]
            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]
            for i in range(top_conf.shape[0]):
                xmin = int(round(top_xmin[i] * img.shape[1]))
                ymin = int(round(top_ymin[i] * img.shape[0]))
                xmax = int(round(top_xmax[i] * img.shape[1]))
                ymax = int(round(top_ymax[i] * img.shape[0]))
                score = top_conf[i]
                label = int(top_label_indices[i])
                bb_coordinate.append(np.array([xmin, ymin, xmax, ymax]))
                detect_img = img_original[ymin:ymax, xmin:xmax, :]
                detect_img = cv2.resize(detect_img, (200, 200))
                resize_imgs.append(detect_img)
        # return the array of resized crops and, on the original image, the
        # top-left (x, y) and bottom-right (x, y) of each box
        return np.array(resize_imgs), np.array(bb_coordinate), img_original
def run_camera(input_shape, model, save_path, frame_number): num_classes = 21 conf_thresh = 0.4 bbox_util = BBoxUtility(num_classes) class_colors = [] for i in range(0, num_classes): hue = 255 * i / num_classes col = np.zeros((1, 1, 3)).astype("uint8") col[0][0][0] = hue col[0][0][1] = 128 # Saturation col[0][0][2] = 255 # Value cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR) col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2])) class_colors.append(col) vid = cv2.VideoCapture(0) # Compute aspect ratio of video vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH) vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT) # vidar = vidw / vidh samples = os.listdir(save_path) sample_count = len(samples) empty_count = 0 image_stack = [] while True: retval, orig_image = vid.read() if not retval: print("Done!") return None im_size = (input_shape[0], input_shape[1]) resized = cv2.resize(orig_image, im_size) rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) inputs = [image.img_to_array(rgb)] tmp_inp = np.array(inputs) x = preprocess_input(tmp_inp) y = model.predict(x) results = bbox_util.detection_out(y) if len(results) > 0 and len(results[0]) > 0: det_label = results[0][:, 0] det_conf = results[0][:, 1] det_xmin = results[0][:, 2] det_ymin = results[0][:, 3] det_xmax = results[0][:, 4] det_ymax = results[0][:, 5] top_indices = [ i for i, conf in enumerate(det_conf) if conf >= conf_thresh ] top_conf = det_conf[top_indices] top_label_indices = det_label[top_indices].tolist() top_xmin = det_xmin[top_indices] top_ymin = det_ymin[top_indices] top_xmax = det_xmax[top_indices] top_ymax = det_ymax[top_indices] if 15 not in top_label_indices: empty_count += 1 if empty_count == 4: image_stack = [] empty_count = 0 else: empty_count = 0 for i in range(top_conf.shape[0]): xmin = int(round((top_xmin[i] * vidw) * 0.9)) ymin = int(round((top_ymin[i] * vidh) * 0.9)) xmax = int(round( (top_xmax[i] * vidw) * 1.1)) if int(round( (top_xmax[i] * vidw) * 1.1)) <= vidw else int( round(top_xmax[i] * vidw)) ymax = int(round( (top_ymax[i] * vidh) * 1.1)) if int(round( (top_ymax[i] * vidh) * 1.1)) <= vidh else int( round(top_ymax[i] * vidh)) # save frames class_num = int(top_label_indices[i]) if class_num == 15: cv2.rectangle(orig_image, (xmin, ymin), (xmax, ymax), class_colors[class_num], 2) frame = orig_image if len(image_stack) < frame_number: image_stack.append(frame[ymin:ymax, xmin:xmax, :]) if len(image_stack) == frame_number: image_stack.pop(0) image_stack.append(frame[ymin:ymax, xmin:xmax, :]) cv2.imshow("SSD result", orig_image) if cv2.waitKey(5) & 0xFF == ord('s'): if len(image_stack) == frame_number: if not os.path.exists(save_path + str(sample_count + 1)): os.mkdir(save_path + str(sample_count + 1)) for pic in range(frame_number): cv2.imwrite( save_path + str(sample_count + 1) + '/' + str(1000 + pic) + '.jpg', image_stack[pic]) print('saving ' + save_path + str(sample_count + 1) + '/' + str(1000 + pic) + '.jpg') image_stack = [] empty_count = 0 sample_count += 1
class VideoTest(object): """ Class for testing a trained SSD model on a video file and show the# {{{ result in a window. Class is designed so that one VideoTest object can be created for a model, and the same object can then be used on multiple videos and webcams. Arguments: class_names: A list of strings, each containing the name of a class. The first name should be that of the background class which is not used. model: An SSD model. It should already be trained for images similar to the video to test on. input_shape: The shape that the model expects for its input, as a tuple, for example (300, 300, 3) bbox_util: An instance of the BBoxUtility class in ssd_utils.py The BBoxUtility needs to be instantiated with the same number of classes as the length of class_names. """# }}} def __init__(self, class_names, model, input_shape, confidence): # {{{ self.class_names = class_names self.num_classes = len(class_names) self.model = model self.input_shape = input_shape self.confidence = confidence self.bbox_util = BBoxUtility(self.num_classes) self.next_ID = 0 # Create unique and somewhat visually distinguishable bright # colors for the different classes. self.class_colors = [] for i in range(0, self.num_classes): # This can probably be written in a more elegant manner hue = 255 * i / self.num_classes col = np.zeros((1, 1, 3)).astype("uint8") col[0][0][0] = hue col[0][0][1] = 128 # Saturation col[0][0][2] = 255 # Value cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR) col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2])) self.class_colors.append(col) # }}} def run(self, video_path=0, start_frame=0, conf_thresh=0): """ Runs the test on a video (or webcam) # {{{ # Arguments video_path: A file path to a video to be tested on. Can also be a number, in which case the webcam with the same number (i.e. 0) is used instead start_frame: The number of the first frame of the video to be processed by the network. conf_thresh: Threshold of confidence. Any boxes with lower confidence are not visualized. """ vid = cv2.VideoCapture(video_path) if not vid.isOpened(): raise IOError(( "Couldn't open video file or webcam. If you're " "trying to open a webcam, make sure you video_path is an integer!" )) # }}} # Compute aspect ratio of video # {{{ #msvidw = vid.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH) #vidh = vid.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT) vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH) vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT) vidar = vidw / vidh # }}} # Skip frames until reaching start_frame# {{{ if start_frame > 0: vid.set(cv2.cv.CV_CAP_PROP_POS_MSEC, start_frame) accum_time = 0 video_time = 0 curr_fps = 0 fps = "FPS: ??" 
prev_time = timer() # }}} gx, gy, gt, gid = [], [], [], [] hsv = [[int(np.random.rand() * 100), 255, 255] for i in range(255)] for i in range(len(hsv)): hsv[i][0] = (29 * i) % 100 #color = np.random.rand(1024,3) color = [] for i in range(len(hsv)): color.append(hsv2rgb(hsv[i][0], hsv[i][1], hsv[i][2])) color[i][0] = float(color[i][0] / 255) color[i][1] = float(color[i][1] / 255) color[i][2] = float(color[i][2] / 255) #4 point designation w = 4.3 h = 5.4 pts1 = np.float32([[650, 298], [1275, 312], [494, 830], [1460, 845]]) pts1 *= self.input_shape[1] / vidh pts2 = np.float32([[0, 0], [w, 0], [0, h], [w, h]]) Homography = cv2.getPerspectiveTransform(pts1, pts2) Homography2 = cv2.getPerspectiveTransform(pts2, pts1) dt = 1 / vid.get(cv2.CAP_PROP_FPS) trackers = [] pub_gauss1 = rospy.Publisher('gauss1', PoseWithCovarianceStamped, queue_size=10) pub_gauss2 = rospy.Publisher('gauss2', PoseWithCovarianceStamped, queue_size=10) pub_gauss3 = rospy.Publisher('gauss3', PoseWithCovarianceStamped, queue_size=10) pub_markers = rospy.Publisher('markers', MarkerArray, queue_size=10) rospy.init_node('tracker', anonymous=True) r = rospy.Rate(10) gauss1 = PoseWithCovarianceStamped() gauss2 = PoseWithCovarianceStamped() gauss3 = PoseWithCovarianceStamped() gauss1.header.frame_id = "map" gauss2.header.frame_id = "map" gauss3.header.frame_id = "map" markers = MarkerArray() plots = [] while not rospy.is_shutdown(): retval, orig_image = vid.read() # {{{ if not retval: print("Done!") break #return im_size = (self.input_shape[0], self.input_shape[1]) #(300,300) resized = cv2.resize(orig_image, im_size) rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) # Reshape to original aspect ratio for later visualization # The resized version is used, to visualize what kind of resolution # the network has to work with. to_draw = cv2.resize( resized, (int(self.input_shape[0] * vidar), self.input_shape[1])) # Use model to predict inputs = [image.img_to_array(rgb)] tmp_inp = np.array(inputs) X = preprocess_input(tmp_inp) Y = self.model.predict(X) # This line creates a new TensorFlow device every time. Is there a # way to avoid that? 
results = self.bbox_util.detection_out(Y) # }}} new_datas = [] if len(results) > 0 and len(results[0]) > 0: # Interpret output, only one frame is used det_label = results[0][:, 0] det_conf = results[0][:, 1] det_xmin = results[0][:, 2] det_ymin = results[0][:, 3] det_xmax = results[0][:, 4] det_ymax = results[0][:, 5] # top_indices = [i for i, conf in enumerate(det_conf) if conf >= conf_thresh] top_indices = [ i for i, conf in enumerate(det_conf) if conf >= self.confidence ] top_conf = det_conf[top_indices] top_label_indices = det_label[top_indices].tolist() top_xmin = det_xmin[top_indices] top_ymin = det_ymin[top_indices] top_xmax = det_xmax[top_indices] top_ymax = det_ymax[top_indices] #Bbox for i in range(top_conf.shape[0]): xmin = int(round(top_xmin[i] * to_draw.shape[1])) ymin = int(round(top_ymin[i] * to_draw.shape[0])) xmax = int(round(top_xmax[i] * to_draw.shape[1])) ymax = int(round(top_ymax[i] * to_draw.shape[0])) # Draw the box on top of the to_draw image class_num = int(top_label_indices[i]) if ((self.class_names[class_num] == 'person') & (top_conf[i] >= 0.9)): #0.6#0.9#0.996 cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax), self.class_colors[class_num], 2) text = self.class_names[class_num] + " " + ( '%.2f' % top_conf[i]) text_top = (xmin, ymin - 10) text_bot = (xmin + 80, ymin + 5) text_pos = (xmin + 5, ymin) cv2.rectangle(to_draw, text_top, text_bot, self.class_colors[class_num], -1) #print(text , '%.2f' % video_time , ( (xmin+xmax)/2, ymax ) ) cv2.putText(to_draw, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1) cv2.circle(to_draw, ((xmin + xmax) / 2, ymax), 3, (0, 0, 255), -1) imagepoint = [[(xmin + xmax) / 2], [ymax], [1]] groundpoint = np.dot(Homography, imagepoint) groundpoint = (groundpoint / groundpoint[2]).tolist() groundpoint[0] = groundpoint[0][0] groundpoint[1] = groundpoint[1][0] groundpoint[2] = groundpoint[2][0] # if((0<=groundpoint[0]) & (groundpoint[0]<=w) & (0<=groundpoint[1]) & (groundpoint[1]<=h)): # print(text , '%.2f' % video_time , ('%.2f' % groundpoint[0] , '%.2f' % groundpoint[1]) ) # gx.append(groundpoint[0]) # gy.append(groundpoint[1]) # gt.append(video_time) # new_datas.append([gx[-1],gy[-1],gt[-1],0]) print(text, '%.2f' % video_time, ('%.2f' % groundpoint[0], '%.2f' % groundpoint[1])) gx.append(groundpoint[0]) gy.append(groundpoint[1]) gt.append(video_time) new_datas.append([gx[-1], gy[-1], gt[-1], 0]) # motion update for i in range(len(trackers)): trackers[i].kf_motion() # measurement update for i in range(len(trackers)): for j in range(len(new_datas)): if (trackers[i].in_error_ellipse( trackers[i].x - new_datas[j][0], trackers[i].y - new_datas[j][1])): trackers[i].kf_measurement_update( new_datas[j][0], new_datas[j][1]) trackers[i].update(trackers[i].x, trackers[i].y, video_time) # plot(trackers[i].x,trackers[i].y,i,trackers[i].col,Homography2,to_draw,plots) gid.append(trackers[i].ID) new_datas[j][3] = 1 plot(trackers[i].x, trackers[i].y, i, trackers[i].col, Homography2, to_draw, plots) # ROS pose with coveriance# {{{ if (len(trackers)): gauss1.pose.pose.position.x = trackers[0].x gauss1.pose.pose.position.y = trackers[0].y theta = m.atan(trackers[0].vy / trackers[0].vx) q = tf.transformations.quaternion_from_euler(theta, 0, 0) gauss1.pose.pose.orientation.x = q[0] gauss1.pose.pose.orientation.y = q[1] gauss1.pose.pose.orientation.z = q[2] gauss1.pose.pose.orientation.w = q[3] gauss1.pose.covariance = np.zeros(36) gauss1.pose.covariance[0] = trackers[0].P[0, 0] gauss1.pose.covariance[1] = trackers[0].P[0, 1] 
gauss1.pose.covariance[6] = trackers[0].P[1, 0]
gauss1.pose.covariance[7] = trackers[0].P[1, 1]
pub_gauss1.publish(gauss1)
if (len(trackers) > 1):
    gauss2.pose.pose.position.x = trackers[1].x
    gauss2.pose.pose.position.y = trackers[1].y
    theta = m.atan(trackers[1].vy / trackers[1].vx)
    q = tf.transformations.quaternion_from_euler(0, 0, theta)
    gauss2.pose.pose.orientation.x = q[0]
    gauss2.pose.pose.orientation.y = q[1]
    gauss2.pose.pose.orientation.z = q[2]
    gauss2.pose.pose.orientation.w = q[3]
    gauss2.pose.covariance = np.zeros(36)
    gauss2.pose.covariance[0] = trackers[1].P[0, 0]
    gauss2.pose.covariance[1] = trackers[1].P[0, 1]
    gauss2.pose.covariance[6] = trackers[1].P[1, 0]
    gauss2.pose.covariance[7] = trackers[1].P[1, 1]
    pub_gauss2.publish(gauss2)
if (len(trackers) > 2):
    gauss3.pose.pose.position.x = trackers[2].x
    gauss3.pose.pose.position.y = trackers[2].y
    theta = m.atan(trackers[2].vy / trackers[2].vx)
    q = tf.transformations.quaternion_from_euler(0, 0, theta)
    gauss3.pose.pose.orientation.x = q[0]
    gauss3.pose.pose.orientation.y = q[1]
    gauss3.pose.pose.orientation.z = q[2]
    gauss3.pose.pose.orientation.w = q[3]
    gauss3.pose.covariance = np.zeros(36)
    gauss3.pose.covariance[0] = trackers[2].P[0, 0]
    gauss3.pose.covariance[1] = trackers[2].P[0, 1]
    gauss3.pose.covariance[6] = trackers[2].P[1, 0]
    gauss3.pose.covariance[7] = trackers[2].P[1, 1]
    pub_gauss3.publish(gauss3)
# }}}
#scores = [[0 for i in range(len(new_datas))] for j in range(len(trackers))]
#for i in range(len(trackers)):
#    trackers[i].kf_motion()
#    for j in range(len(new_datas)):
#        scores[i][j] = tracker[i].pro_dens_2d(new_datas[j][0],new_datas[j][1])
#generate new tracker
for i in range(len(new_datas)):
    if (new_datas[i][3] == 0):
        newdetec = len(gx) - len(new_datas) + i
        trackers.append(Tracker(self.next_ID, gx[newdetec], gy[newdetec],
                                video_time, dt))
        gid.append(self.next_ID)
        plot(trackers[self.next_ID].x, trackers[self.next_ID].y,
             self.next_ID, trackers[self.next_ID].col, Homography2,
             to_draw, plots)
        self.next_ID += 1
# Calculate FPS# {{{
# This computes FPS for everything, not just the model's execution
# which may or may not be what you want
curr_time = timer()
exec_time = curr_time - prev_time
prev_time = curr_time
accum_time = accum_time + exec_time
video_time = video_time + 1 / vid.get(cv2.CAP_PROP_FPS)
curr_fps = curr_fps + 1
if accum_time > 1:
    accum_time = accum_time - 1
    fps = "FPS: " + str(curr_fps)
    curr_fps = 0
# }}}
# Draw FPS in top left corner# {{{
cv2.rectangle(to_draw, (0, 0), (50, 17), (255, 255, 255), -1)
cv2.putText(to_draw, fps, (3, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.35,
            (0, 0, 0), 1)
# }}}
for i in range(len(plots)):
    cv2.circle(to_draw, (plots[i][0], plots[i][1]), 3,
               (plots[i][2][0] * 255, plots[i][2][1] * 255,
                plots[i][2][2] * 255), -1)
for i in range(len(gx)):
    marker = Marker()
    marker.header.frame_id = "map"
    marker.header.stamp = rospy.Time.now()
    marker.ns = "basic_shapes"
    marker.id = i
    marker.type = 2  # sphere
    marker.action = Marker.ADD
    marker.pose.position.x = gx[i]
    marker.pose.position.y = gy[i]
    marker.pose.position.z = 0.
    marker.pose.orientation.x = 0.
    marker.pose.orientation.y = 0.
    marker.pose.orientation.z = 0.
    marker.pose.orientation.w = 1.
    marker.scale.x = 0.1
    marker.scale.y = 0.1
    marker.scale.z = 0.1
    marker.color.r = color[gid[i]][2]
    marker.color.g = color[gid[i]][1]
    marker.color.b = color[gid[i]][0]
    marker.color.a = 1.
marker.lifetime = rospy.Duration(0) markers.markers.append(marker) #print len(markers.markers) pub_markers.publish(markers) del markers.markers[:] # draw a sqare# {{{ cv2.line(to_draw, (pts1[0][0], pts1[0][1]), (pts1[1][0], pts1[1][1]), (100, 200, 100), thickness=2) cv2.line(to_draw, (pts1[0][0], pts1[0][1]), (pts1[2][0], pts1[2][1]), (100, 200, 100), thickness=2) cv2.line(to_draw, (pts1[3][0], pts1[3][1]), (pts1[1][0], pts1[1][1]), (100, 200, 100), thickness=2) cv2.line(to_draw, (pts1[3][0], pts1[3][1]), (pts1[2][0], pts1[2][1]), (100, 200, 100), thickness=2) # }}}# }}} cv2.imshow("SSD result", to_draw) cv2.waitKey(10) r.sleep() #create graph# {{{ #fig = plt.figure() #ax=Axes3D(fig) #color = np.random.rand(len(trackers),3) #for i in range(len(gx)): # iro = (color[gid[i]][2],color[gid[i]][1],color[gid[i]][0]) # ax.scatter(gx[i],gy[i],gt[i],s=5,c=iro) #ax.scatter(gx, gy, gt, s=5, c="blue") #ax.set_xlabel('x') #ax.set_ylabel('y') #ax.set_zlabel('t') #plt.show()# }}} cv2.destroyAllWindows() vid.release() return
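# --- Worked example (not in the original source) of the ground-plane mapping
# used by the tracker above: a homography H maps an image point (u, v, 1) to
# homogeneous ground coordinates, which are dehomogenized by dividing by the
# third component. The calibration points below are made up, not the original
# camera's.
import cv2
import numpy as np

pts_img = np.float32([[650, 298], [1275, 312], [494, 830], [1460, 845]])
pts_gnd = np.float32([[0, 0], [4.3, 0], [0, 5.4], [4.3, 5.4]])  # meters
H = cv2.getPerspectiveTransform(pts_img, pts_gnd)

u, v = 960, 600  # a detected foot point in image coordinates
g = H.dot(np.array([u, v, 1.0]))
print('ground position: ({:.2f}, {:.2f}) m'.format(g[0] / g[2], g[1] / g[2]))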
images.append(imread(img_path)) inputs.append(img.copy()) img_path = './pics/1.png' img = image.load_img(img_path, target_size=(300, 300)) img = image.img_to_array(img) images.append(imread(img_path)) inputs.append(img.copy()) inputs = preprocess_input(np.array(inputs)) # In[5]: preds = model.predict(inputs, batch_size=1, verbose=1) # In[6]: results = bbox_util.detection_out(preds) # In[8]: count = 0 for i, img in enumerate(images): # Parse the outputs. det_label = results[i][:, 0] det_conf = results[i][:, 1] det_xmin = results[i][:, 2] det_ymin = results[i][:, 3] det_xmax = results[i][:, 4] det_ymax = results[i][:, 5] # Get detections with confidence higher than 0.6. top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.3] #0.6
def create_model_prediction(self, n_images=400):
    priors = pickle.load(
        open(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                'priorFiles/prior_boxes_ssd300MobileNetV2_224_224.pkl'),
            'rb'))
    np.set_printoptions(suppress=True)
    bbox_util = BBoxUtility(NUM_CLASSES, priors)

    inputs = []
    images = []
    result_images = []
    annotation_files = []
    print('Prepare : {} files for evaluation.'.format(n_images))
    input_shape = (self.sets['img_height'], self.sets['img_width'], 3)
    with open(
            os.path.join(self.sets['dataset_dir'],
                         'VOC2007/ImageSets/Main/test.txt'), 'r') as annot_f:
        for annotation in tqdm(list(annot_f)[:n_images]):
            try:
                img_path = os.path.join(
                    self.sets['dataset_dir'], 'VOC2007/JPEGImages/'
                ) + annotation.split(' ')[0].strip() + '.jpg'
                img = image.load_img(img_path,
                                     target_size=(input_shape[0],
                                                  input_shape[1]))
                img = image.img_to_array(img)
                result_images.append(img_path)
                images.append(img)
                inputs.append(img.copy())
                annotation_files.append(annotation)
            except Exception as e:
                print('Error while opening file.', e)

    result_detections = []
    # inputs = preprocess_input(np.array(inputs)[:, :, :, ::-1], mode="tf")
    inputs = np.array(inputs)
    inputs = preprocess_input(inputs)
    print('inputs: {}'.format(inputs.shape))

    results = []
    for img in tqdm(inputs):
        # predict_on_batch runs the compiled TVM module; the raw output
        # tensor is then fetched from the underlying graph runtime.
        # self.m._model.set_input(input_name, tvm.nd.array(img))
        # self.m._model.run()
        tvm_output = self.m.predict_on_batch(img)
        # ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=1)
        # prof_res = np.array(ftimer().results) * 1000  # convert to ms
        tvm_output = self.m._model.get_output(0)
        img_result = bbox_util.detection_out(tvm_output.asnumpy())
        results.append(img_result)
    results = np.array(results)
    results = np.squeeze(results, axis=1)
    print('results: {}'.format(results.shape))

    for i, img in tqdm(enumerate(images)):
        det_label = results[i][:, 0]
        det_conf = results[i][:, 1]
        det_xmin = results[i][:, 2]
        det_ymin = results[i][:, 3]
        det_xmax = results[i][:, 4]
        det_ymax = results[i][:, 5]
        top_indices = [
            i for i, conf in enumerate(det_conf) if conf >= THRESHOLD
        ]
        top_conf = det_conf[top_indices]
        top_label_indices = det_label[top_indices].tolist()
        top_xmin = det_xmin[top_indices]
        top_ymin = det_ymin[top_indices]
        top_xmax = det_xmax[top_indices]
        top_ymax = det_ymax[top_indices]

        detections = []
        # Coordinates are kept relative (0-1); multiplying by img.shape
        # as in int(round(top_xmin[k] * img.shape[1])) would give pixel
        # values instead. The inner loop variable is renamed to `k` so it
        # no longer shadows the enumerate index above.
        for k in range(top_conf.shape[0]):
            xmin = top_xmin[k]
            ymin = top_ymin[k]
            xmax = top_xmax[k]
            ymax = top_ymax[k]
            score = top_conf[k]
            label = int(top_label_indices[k])
            label_name = CLASSES[label - 1]
            detections.append([
                '{:.2f}'.format(xmin), '{:.2f}'.format(ymin),
                '{:.2f}'.format(xmax), '{:.2f}'.format(ymax), label_name,
                '{:.2f}'.format(score)
            ])
        result_detections.append(detections)

    print('Test images: {}'.format(len(result_images)))
    print('result_detections: {}'.format(len(result_detections)))

    model_predictions = []
    MODEL_PREDICTION_PATH = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'model_evaluation/model_prediction/')
    predicted_images = []
    for index, image_filename in tqdm(enumerate(result_images)):
        image_name = os.path.basename(image_filename)
        path_elements = image_name[:-4]
        predicted_images.append(path_elements)
        annot_dir = MODEL_PREDICTION_PATH
        os.makedirs(annot_dir, exist_ok=True)
        annot_name = '{}.txt'.format(path_elements)
        annot_filename = os.path.join(annot_dir, annot_name)
        with open(annot_filename, 'w') as output_f:
            for d in result_detections[index]:
                left, top, right, bottom, class_name, score = d
                model_predictions.append(
                    (class_name, score, left, top, right, bottom))
                output_f.write('{} {} {} {} {} {}\n'.format(
                    class_name, score, left, top, right, bottom))

    GROUND_TRUTH_LABELS = os.path.join(self.sets['dataset_dir'],
                                       'VOC2007/Annotations')
    GROUND_TRUTH_PATH = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'model_evaluation/ground_truth/')
    for f in glob(GROUND_TRUTH_PATH + '*'):
        os.remove(f)

    filenames = os.listdir(GROUND_TRUTH_LABELS)
    ground_images = []
    for filename in tqdm(filenames):
        if filename[:-4] not in predicted_images:
            continue
        ground_images.append(filename[:-4])
        tree = ElementTree.parse(
            os.path.join(GROUND_TRUTH_LABELS + '/{}'.format(filename)))
        root = tree.getroot()
        bounding_boxes = []
        one_hot_classes = []
        size_tree = root.find('size')
        width = float(size_tree.find('width').text)
        height = float(size_tree.find('height').text)
        for object_tree in root.findall('object'):
            for bounding_box in object_tree.iter('bndbox'):
                xmin = float(bounding_box.find('xmin').text) / width
                ymin = float(bounding_box.find('ymin').text) / height
                xmax = float(bounding_box.find('xmax').text) / width
                ymax = float(bounding_box.find('ymax').text) / height
            class_name = object_tree.find('name').text.title()
            bounding_box = [class_name, xmin, ymin, xmax, ymax]
            bounding_boxes.append(bounding_box)
        with open(
                os.path.join(GROUND_TRUTH_PATH,
                             filename.replace('xml', 'txt')), 'w+') as f:
            for p in bounding_boxes:
                f.write(' '.join([str(s) for s in p]) + "\n")

    print('Completed eval preparation')
    assert len(ground_images) == len(predicted_images)
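# The per-image .txt files written above (predictions and ground truth in
# matching formats) are the usual input to a VOC-style mAP evaluation. As a
# hedged illustration (the evaluation script itself is not part of this
# document), matching a predicted box against a ground-truth box typically
# reduces to intersection-over-union:
def iou(box_a, box_b):
    """IoU of two (xmin, ymin, xmax, ymax) boxes in the same
    (relative) coordinates used by the files written above."""
    ixmin = max(box_a[0], box_b[0])
    iymin = max(box_a[1], box_b[1])
    ixmax = min(box_a[2], box_b[2])
    iymax = min(box_a[3], box_b[3])
    inter = max(ixmax - ixmin, 0.0) * max(iymax - iymin, 0.0)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0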
class VideoTest(object):
    """ Class for testing a trained SSD model on a video file and showing
    the result in a window. The class is designed so that one VideoTest
    object can be created for a model, and the same object can then be used
    on multiple videos and webcams.

    Arguments:
        class_names: A list of strings, each containing the name of a class.
                     The first name should be that of the background class
                     which is not used.

        model:       An SSD model. It should already be trained for
                     images similar to the video to test on.

        input_shape: The shape that the model expects for its input,
                     as a tuple, for example (300, 300, 3)

        bbox_util:   An instance of the BBoxUtility class in ssd_utils.py
                     The BBoxUtility needs to be instantiated with
                     the same number of classes as the length of
                     class_names.
    """

    def __init__(self, class_names, model, input_shape):
        self.class_names = class_names
        self.num_classes = len(class_names)
        self.model = model
        self.input_shape = input_shape
        self.bbox_util = BBoxUtility(self.num_classes)

        # Create unique and somewhat visually distinguishable bright
        # colors for the different classes.
        self.class_colors = []
        for i in range(0, self.num_classes):
            # This can probably be written in a more elegant manner
            hue = 255 * i / self.num_classes
            col = np.zeros((1, 1, 3)).astype("uint8")
            col[0][0][0] = hue
            col[0][0][1] = 128  # Saturation
            col[0][0][2] = 255  # Value
            cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
            col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]),
                   int(cvcol[0][0][2]))
            self.class_colors.append(col)

    def run(self, video_path=0, start_frame=0, conf_thresh=0.6):
        """ Runs the test on a video (or webcam)

        # Arguments
        video_path: A file path to a video to be tested on. Can also be a
                    number, in which case the webcam with the same number
                    (i.e. 0) is used instead

        start_frame: The number of the first frame of the video to be
                     processed by the network.

        conf_thresh: Threshold of confidence. Any boxes with lower
                     confidence are not visualized.
        """
        vid = cv2.VideoCapture(video_path)
        if not vid.isOpened():
            raise IOError(("Couldn't open video file or webcam. If you're "
                           "trying to open a webcam, make sure your "
                           "video_path is an integer!"))

        # Compute aspect ratio of video. The cv2.cv.CV_CAP_PROP_* constants
        # were removed in OpenCV 3; the cv2.CAP_PROP_* names replace them.
        vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
        vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
        vidar = vidw / vidh

        # Skip frames until reaching start_frame. start_frame is documented
        # as a frame number, so CAP_PROP_POS_FRAMES is the matching property
        # (the original set CAP_PROP_POS_MSEC, i.e. milliseconds).
        if start_frame > 0:
            vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        accum_time = 0
        curr_fps = 0
        fps = "FPS: ??"
        prev_time = timer()

        while True:
            retval, orig_image = vid.read()
            if not retval:
                print("Done!")
                return

            im_size = (self.input_shape[0], self.input_shape[1])
            resized = cv2.resize(orig_image, im_size)
            rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

            # Reshape to original aspect ratio for later visualization
            # The resized version is used, to visualize what kind of
            # resolution the network has to work with.
            to_draw = cv2.resize(resized,
                                 (int(self.input_shape[0] * vidar),
                                  self.input_shape[1]))

            # Use model to predict
            inputs = [image.img_to_array(rgb)]
            tmp_inp = np.array(inputs)
            x = preprocess_input(tmp_inp)
            y = self.model.predict(x)
            # This line creates a new TensorFlow device every time. Is there
            # a way to avoid that?
            results = self.bbox_util.detection_out(y)

            if len(results) > 0 and len(results[0]) > 0:
                # Interpret output, only one frame is used
                det_label = results[0][:, 0]
                det_conf = results[0][:, 1]
                det_xmin = results[0][:, 2]
                det_ymin = results[0][:, 3]
                det_xmax = results[0][:, 4]
                det_ymax = results[0][:, 5]

                top_indices = [i for i, conf in enumerate(det_conf)
                               if conf >= conf_thresh]
                top_conf = det_conf[top_indices]
                top_label_indices = det_label[top_indices].tolist()
                top_xmin = det_xmin[top_indices]
                top_ymin = det_ymin[top_indices]
                top_xmax = det_xmax[top_indices]
                top_ymax = det_ymax[top_indices]

                for i in range(top_conf.shape[0]):
                    xmin = int(round(top_xmin[i] * to_draw.shape[1]))
                    ymin = int(round(top_ymin[i] * to_draw.shape[0]))
                    xmax = int(round(top_xmax[i] * to_draw.shape[1]))
                    ymax = int(round(top_ymax[i] * to_draw.shape[0]))

                    # Draw the box on top of the to_draw image
                    class_num = int(top_label_indices[i])
                    cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax),
                                  self.class_colors[class_num], 2)
                    text = self.class_names[class_num] + " " \
                        + ('%.2f' % top_conf[i])

                    text_top = (xmin, ymin - 10)
                    text_bot = (xmin + 80, ymin + 5)
                    text_pos = (xmin + 5, ymin)
                    cv2.rectangle(to_draw, text_top, text_bot,
                                  self.class_colors[class_num], -1)
                    cv2.putText(to_draw, text, text_pos,
                                cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)

            # Calculate FPS
            # This computes FPS for everything, not just the model's
            # execution, which may or may not be what you want
            curr_time = timer()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1
            if accum_time > 1:
                accum_time = accum_time - 1
                fps = "FPS: " + str(curr_fps)
                curr_fps = 0

            # Draw FPS in top left corner
            cv2.rectangle(to_draw, (0, 0), (50, 17), (255, 255, 255), -1)
            cv2.putText(to_draw, fps, (3, 10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.35, (0, 0, 0), 1)

            cv2.imshow("SSD result", to_draw)
            cv2.waitKey(10)
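# For reference, a minimal driver for VideoTest might look like the
# following. This is a sketch, not part of the original file: it assumes the
# SSD300 constructor and the 'weights_SSD300.hdf5' weights used elsewhere in
# this document, a class list whose first entry is the background class, and
# that SSD300 is importable from a module named `ssd` (an assumption about
# the repository layout).
from ssd import SSD300  # module path is an assumption

input_shape = (300, 300, 3)
class_names = ['background',
               'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
               'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
               'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
               'train', 'tvmonitor']

model = SSD300(input_shape, num_classes=len(class_names))
model.load_weights('weights_SSD300.hdf5', by_name=True)

vid_test = VideoTest(class_names, model, input_shape)
vid_test.run(0)  # 0 opens the default webcam; a video file path also works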
def predict_to_coord(model_weights, image_path, band=[4, 3, 2],
                     conf_threshold=0.5):
    '''
    Input :
        model_weights   path to the trained model weights
        image_path      image to run prediction on,
                        e.g. r'...\to_xiangyu\geo_.tif'
        band            bands to select; the first band is numbered 1
        conf_threshold  confidence threshold for the output results
    Output :
        Prj_id          EPSG code of the image's spatial reference system
        result_items    list of (row, column, confidence, coordinates)
                        tuples
    '''
    voc_classes = ['popp']
    NUM_CLASSES = len(voc_classes) + 1
    input_shape = (300, 300, 3)
    model = SSD300(input_shape, num_classes=NUM_CLASSES)
    model.load_weights(model_weights, by_name=True)
    bbox_util = BBoxUtility(NUM_CLASSES)

    dataset = gdal.Open(image_path)
    GeoTransform = dataset.GetGeoTransform()
    x0, y0 = GeoTransform[0], GeoTransform[3]  # upper-left corner
    x_pixel, y_pixel = GeoTransform[1], GeoTransform[5]  # pixel size in x, y
    Prj_id = int(dataset.GetProjection().split('"')[-2])  # EPSG, e.g. 4326
    im_width = dataset.RasterXSize   # number of raster columns
    im_height = dataset.RasterYSize  # number of raster rows
    im_band = dataset.RasterCount

    if np.max(band) > im_band:
        raise Exception('band index exceeds the number of raster bands')
    band = [b - 1 for b in band]  # GDAL band numbers are 1-based

    # Tile the raster with a stride of `overlap` pixels; `overlap` and
    # `cut_size` (tile size) are module-level constants defined elsewhere.
    y_num = int(im_height / overlap)
    x_num = int(im_width / overlap)

    result_items = []
    for row in range(y_num):
        for column in range(x_num):
            y_st, x_st = overlap * row, overlap * column
            # Clip the tile at the raster borders.
            if im_width >= x_st + cut_size:
                if im_height >= y_st + cut_size:
                    im_data = dataset.ReadAsArray(x_st, y_st, cut_size,
                                                  cut_size)
                else:
                    im_data = dataset.ReadAsArray(x_st, y_st, cut_size,
                                                  im_height - y_st)
            else:
                if im_height >= y_st + cut_size:
                    im_data = dataset.ReadAsArray(x_st, y_st,
                                                  im_width - x_st, cut_size)
                else:
                    im_data = dataset.ReadAsArray(x_st, y_st,
                                                  im_width - x_st,
                                                  im_height - y_st)
            im_data[im_data > 65534] = 0  # mask no-data values
            x = np.swapaxes(im_data, 0, 1)
            x = np.swapaxes(x, 2, 1)  # (band, h, w) -> (h, w, band)
            im_data = x[..., band]
            im_data_max = im_data.max()
            im_data_min = im_data.min()

            inputs = []
            images = []
            mask = im_data.copy()
            mask[mask > 0] = 1
            # Fraction of valid (non-zero) pixels; tiles that are mostly
            # no-data (ratio <= 0.4) are skipped.
            data_ratio = np.sum(mask) / (3 * cut_size**2)
            if data_ratio > 0.4:
                input_data = (im_data - im_data_min) \
                    / (im_data_max - im_data_min) * 255
                input_data = cv2.resize(input_data, (300, 300))
                images.append(input_data)
                inputs.append(input_data)
                inputs = preprocess_input(np.array(inputs))
                preds = model.predict(inputs, batch_size=1, verbose=1)
                results = bbox_util.detection_out(preds)
                for i, img in enumerate(images):
                    # Parse the outputs.
                    try:
                        det_conf = results[i][:, 1]
                        det_xmin = results[i][:, 2]
                        det_ymin = results[i][:, 3]
                        det_xmax = results[i][:, 4]
                        det_ymax = results[i][:, 5]
                        # Keep detections with confidence above
                        # conf_threshold.
                        top_indices = [
                            j for j, conf in enumerate(det_conf)
                            if conf >= conf_threshold
                        ]
                        top_conf = det_conf[top_indices]
                        top_xmin = det_xmin[top_indices]
                        top_ymin = det_ymin[top_indices]
                        top_xmax = det_xmax[top_indices]
                        top_ymax = det_ymax[top_indices]
                    except Exception:
                        # `continue` instead of the original `pass`: falling
                        # through would reuse the previous tile's top_* arrays.
                        print('this image has no result; skipping')
                        continue
                    for k in range(top_conf.shape[0]):
                        score = top_conf[k]
                        # Convert relative tile coordinates to georeferenced
                        # coordinates via the affine geotransform. NOTE: this
                        # mapping assumes overlap == cut_size, i.e.
                        # non-overlapping tiles.
                        rel_xmin = x0 + x_pixel * cut_size * (top_xmin[k]
                                                              + column)
                        rel_ymin = y0 + y_pixel * cut_size * (top_ymin[k]
                                                              + row)  # y_pixel is negative (downward)
                        rel_xmax = x0 + x_pixel * cut_size * (top_xmax[k]
                                                              + column)
                        rel_ymax = y0 + y_pixel * cut_size * (top_ymax[k]
                                                              + row)  # y_pixel is negative (downward)
                        coords = ((rel_xmin, rel_ymin), (rel_xmax, rel_ymin),
                                  (rel_xmax, rel_ymax), (rel_xmin, rel_ymax),
                                  (rel_xmin, rel_ymin))
                        # (row, column, confidence, coordinates)
                        item = (row, column, score, coords)
                        result_items.append(item)
    del dataset
    return Prj_id, result_items
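# To make the geotransform arithmetic above concrete, here is a small worked
# example with hypothetical numbers (not taken from the document). It shows
# how a detection at relative position (0.5, 0.5) inside tile (row=2,
# column=3) maps to georeferenced coordinates; y_pixel is negative, so y
# decreases as the row index grows.
x0, y0 = 116.0, 40.0             # upper-left corner (hypothetical)
x_pixel, y_pixel = 1e-4, -1e-4   # pixel size in degrees (hypothetical)
cut_size = 300

row, column = 2, 3               # tile indices
top_xmin, top_ymin = 0.5, 0.5    # relative position inside the tile

rel_x = x0 + x_pixel * cut_size * (top_xmin + column)  # 116.0 + 0.03 * 3.5 = 116.105
rel_y = y0 + y_pixel * cut_size * (top_ymin + row)     # 40.0  - 0.03 * 2.5 = 39.925
print(rel_x, rel_y)  # 116.105 39.925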