def get_detection_data(self):
    gt = pickle.load(open(self.gt_path, "rb"))
    name_keys = sorted(gt.keys())
    number = int(round(0.8 * len(name_keys)))
    train_keys = name_keys[:number]
    val_keys = name_keys[number:]
    bbox_util_ = BBoxUtility(self.num_classes, gt)
    gen = Generator(bbox_util_, self.image_path, self.batch_size,
                    train_keys, val_keys,
                    (self.input_shape[0], self.input_shape[1]),
                    num_classes=self.num_classes)
    return gen
def __init__(self, num_classes=21, input_shape=(300, 300, 3), epochs=12):
    self.num_classes = num_classes
    self.batch_size = 4
    self.input_shape = input_shape
    self.epochs = epochs
    self.gt_path = data_path
    self.image_path = image_path
    prior = pickle.load(open(start_data, "rb"))
    self.bbox_util = BBoxUtility(self.num_classes, prior)
    self.pre_trained = weight_path
    self.model = SSD300(self.input_shape, num_classes=self.num_classes)
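# Hypothetical training sketch (not in the original snippets): it assumes the
# two methods above live on a wrapper class, here called SSDTrainer, and that
# the MultiboxLoss class and the Generator attributes (generate, train_batches,
# val_batches) behave as in the ssd_keras codebase. Names are assumptions.
trainer = SSDTrainer(num_classes=21, input_shape=(300, 300, 3), epochs=12)
gen = trainer.get_detection_data()
trainer.model.compile(optimizer='adam',
                      loss=MultiboxLoss(trainer.num_classes).compute_loss)
trainer.model.fit_generator(gen.generate(True),
                            steps_per_epoch=gen.train_batches // trainer.batch_size,
                            validation_data=gen.generate(False),
                            validation_steps=gen.val_batches // trainer.batch_size,
                            epochs=trainer.epochs)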
from ssd import SSD300
from utils.prior_box_creator import PriorBoxCreator
from image_generator import ImageGenerator
# NOTE: XMLParser, flatten_prior_boxes, add_variances, split_data and
# BBoxUtility are used below but were not imported in the original snippet;
# they are assumed to come from this project's utils package.

num_classes = 21
model = SSD300()
image_shape = model.input_shape[1:]
box_creator = PriorBoxCreator(model)
prior_boxes = box_creator.create_boxes()

root_prefix = '../datasets/VOCdevkit/VOC2007/'
ground_data_prefix = root_prefix + 'Annotations/'
image_prefix = root_prefix + 'JPEGImages/'
ground_truth_manager = XMLParser(ground_data_prefix, background_id=None)
ground_truth_data = ground_truth_manager.get_data()

prior_boxes = flatten_prior_boxes(prior_boxes)
prior_boxes = add_variances(prior_boxes)
bbox_util = BBoxUtility(num_classes, prior_boxes)
result = bbox_util.assign_boxes(ground_truth_data['000007.jpg'])

train_keys, val_keys = split_data(ground_truth_data, training_ratio=.8)
image_generator = ImageGenerator(ground_truth_data, bbox_util, 10,
                                 (300, 300), train_keys, val_keys,
                                 image_prefix)
data = next(image_generator.flow(mode='train'))
# TODO: test the differences between this bbox_util and the original one;
# why can't you train with this?
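# split_data is called above but not defined in this collection. A minimal
# sketch of a plausible implementation (an assumption, not the original
# helper): shuffle the annotation keys and split them by the given ratio.
import random

def split_data(ground_truth_data, training_ratio=0.8):
    # Sort first so the shuffle is reproducible when random is seeded
    keys = sorted(ground_truth_data.keys())
    random.shuffle(keys)
    split = int(round(training_ratio * len(keys)))
    return keys[:split], keys[split:]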
import time

import cv2  # used below but missing from the original imports
from PyQt5 import QtCore, QtGui, QtWidgets
from models.model_c3d import *
from models.model_2d import *
from models.ssd import SSD300 as SSD
from utils.clip_detector import process_image
from utils.pose_detector import detect_image
from utils.ssd_utils import BBoxUtility
from utils.processing import preprocessing
from config import *
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

bbox_util = BBoxUtility(21)
ssd_model = SSD(ssd_input_shape, num_classes=21)
ssd_model.load_weights('weights_SSD300.hdf5')
c3d = c3d_model(c3d_input_shape, nb_classes=len(action_classes))
c3d.load_weights('results/weights_c3d_mask.h5')
cnn = cnn_2d(cnn_2d_input_shape, nb_classes=len(pose_classes))
cnn.load_weights('results/cnn_2d_{0}.h5'.format(mode))


class Ui_MainWindow(QtWidgets.QWidget):
    def __init__(self, parent=None):
        super(Ui_MainWindow, self).__init__(parent)
        self.timer_camera = QtCore.QTimer()
        self.cap = cv2.VideoCapture()
        self.CAM_NUM = 0
class Video_tracker:
    def __init__(self, model, input_shape, name_classes):
        print("Processing camera/video input!")
        self.model = model
        self.num_classes = len(name_classes)
        self.name_classes = name_classes
        self.width, self.height = input_shape[0], input_shape[1]
        self.window_pos_x, self.window_pos_y = (60, 40)
        self.bbox_util = BBoxUtility(num_classes=self.num_classes)

    def run(self, filepath, conf_thresh=0.4):
        """Run SSD detection on one image, draw the boxes, show and save it."""
        frame = cv.imread(filepath)
        src = np.copy(frame)
        resized = cv.resize(frame, (self.width, self.height))
        rgb = cv.cvtColor(resized, cv.COLOR_BGR2RGB)
        src_shape = src.shape
        inputs = [img_to_array(rgb)]
        x = preprocess_input(np.array(inputs))
        y = self.model.predict(x)
        results = self.bbox_util.detection_out(y)
        to_draw = cv.resize(resized, (int(src_shape[1]), int(src_shape[0])))
        if len(results) > 0 and len(results[0]) > 0:
            # Interpret output; only one frame is used
            det_label = results[0][:, 0]
            det_conf = results[0][:, 1]
            det_xmin = results[0][:, 2]
            det_ymin = results[0][:, 3]
            det_xmax = results[0][:, 4]
            det_ymax = results[0][:, 5]
            top_indices = [i for i, conf in enumerate(det_conf)
                           if conf >= conf_thresh]
            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]
            for i in range(top_conf.shape[0]):
                xmin = int(round(top_xmin[i] * to_draw.shape[1]))
                ymin = int(round(top_ymin[i] * to_draw.shape[0]))
                xmax = int(round(top_xmax[i] * to_draw.shape[1]))
                ymax = int(round(top_ymax[i] * to_draw.shape[0]))
                class_num = int(top_label_indices[i])
                cv.rectangle(src, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)
                text = self.name_classes[class_num] + " " + ('%.2f' % top_conf[i])
                text_top = (xmin, ymin - 10)
                text_bot = (xmin + 80, ymin + 5)
                text_pos = (xmin + 5, ymin)
                cv.rectangle(src, text_top, text_bot, (0, 255, 0), -1)
                cv.putText(src, text, text_pos,
                           cv.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)
        cv.imshow("pic", src)
        cv.waitKey(0)
        cv.destroyAllWindows()
        cv.imwrite("output.png", src)
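# A minimal usage sketch (an assumption, not part of the original source):
# the weight file name is borrowed from another snippet in this collection,
# and voc_class_names stands in for the caller's 21-entry VOC class list.
model = SSD300((300, 300, 3), num_classes=21)
model.load_weights('weights_SSD300.hdf5')
tracker = Video_tracker(model, (300, 300), voc_class_names)
tracker.run('test.jpg', conf_thresh=0.4)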
class VideoTest(object):
    def __init__(self, class_names, model, input_shape):
        self.class_names = class_names
        self.num_classes = len(class_names)
        self.model = model
        self.input_shape = input_shape
        self.bbox_util = BBoxUtility(self.num_classes)

        # Create unique and somewhat visually distinguishable bright
        # colors for the different classes.
        self.class_colors = []
        for i in range(0, self.num_classes):
            # This can probably be written in a more elegant manner
            hue = 255 * i / self.num_classes
            col = np.zeros((1, 1, 3)).astype("uint8")
            col[0][0][0] = hue
            col[0][0][1] = 128  # Saturation
            col[0][0][2] = 255  # Value
            cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
            col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2]))
            self.class_colors.append(col)

    def run(self, video_path=0, start_frame=0, conf_thresh=0):
        # Prepare the video file
        print('------------------------------------------------')
        print("input filename.mov or 0")
        print("input the name of video : ", end='')
        videoName = input()
        if videoName == '0':
            videoName = 'WebCam.mov'
            save(0)
            videoPass = '******'
            vid = cv2.VideoCapture(videoPass)
            total_frames = vid.get(cv2.CAP_PROP_FRAME_COUNT)
            tmp_key = 1
            print('Total frames : ', total_frames)
        else:
            videoPass = '******' + videoName
            vid = cv2.VideoCapture(videoPass)
            total_frames = vid.get(cv2.CAP_PROP_FRAME_COUNT)
            tmp_key = 1
            print('Total frames : ', total_frames)
        print('------------------------------------------------')

        if not vid.isOpened():
            raise IOError(("Couldn't open video file or webcam. If you're "
                           "trying to open a webcam, make sure your "
                           "video_path is an integer!"))

        # Compute aspect ratio of video
        vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
        vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
        vidar = vidw / vidh

        # Skip frames until reaching start_frame
        if start_frame > 0:
            vid.set(cv2.CAP_PROP_POS_MSEC, start_frame)

        # Set up the output video writer
        frame_rate = 24
        fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        f_v = '../processed/' + videoName
        video = cv2.VideoWriter(f_v, fourcc, frame_rate, (int(vidw), int(vidh)))

        # Show a progress bar
        if tmp_key != 0:
            pbar = tqdm(total=total_frames)
        else:
            print('processing...')

        while vid.isOpened():
            # Advance the progress bar
            if tmp_key != 0:
                pbar.update(1)

            # Stop once all frames have been read
            retval, orig_image = vid.read()
            if not retval:
                print("Done!")
                return

            im_size = (self.input_shape[0], self.input_shape[1])
            resized = cv2.resize(orig_image, im_size)
            rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

            # Reshape to original aspect ratio for later visualization.
            # The resized version is used, to visualize what kind of
            # resolution the network has to work with.
            to_draw = cv2.resize(resized,
                                 (int(self.input_shape[0] * vidar),
                                  self.input_shape[1]))

            # Use model to predict
            inputs = [image.img_to_array(rgb)]
            tmp_inp = np.array(inputs)
            x = preprocess_input(tmp_inp)
            y = self.model.predict(x)
            # This line creates a new TensorFlow device every time. Is there
            # a way to avoid that?
            results = self.bbox_util.detection_out(y)

            if len(results) > 0 and len(results[0]) > 0:
                # Interpret output, only one frame is used
                det_label = results[0][:, 0]
                det_conf = results[0][:, 1]
                det_xmin = results[0][:, 2]
                det_ymin = results[0][:, 3]
                det_xmax = results[0][:, 4]
                det_ymax = results[0][:, 5]
                top_indices = [i for i, conf in enumerate(det_conf)
                               if conf >= conf_thresh]
                top_conf = det_conf[top_indices]
                top_label_indices = det_label[top_indices].tolist()
                top_xmin = det_xmin[top_indices]
                top_ymin = det_ymin[top_indices]
                top_xmax = det_xmax[top_indices]
                top_ymax = det_ymax[top_indices]

                for i in range(top_conf.shape[0]):
                    class_num = int(top_label_indices[i])
                    # Only 'person' detections with at least 30% confidence
                    # are processed
                    if (top_conf[i] > 0.3
                            and self.class_names[class_num] == 'person'):
                        xmin = int(round(top_xmin[i] * to_draw.shape[1]))
                        ymin = int(round(top_ymin[i] * to_draw.shape[0]))
                        xmax = int(round(top_xmax[i] * to_draw.shape[1]))
                        ymax = int(round(top_ymax[i] * to_draw.shape[0]))
                        # Apply a mosaic to the detected region
                        to_draw[ymin:ymax, xmin:xmax] = mosaic_area(
                            to_draw, xmin, ymin, xmax, ymax)

            # Resize back from the enlarged detection size to the original
            to_draw = cv2.resize(to_draw, (int(vidw), int(vidh)))
            cv2.startWindowThread()
            # Display the frame
            cv2.imshow("SSD result", to_draw)
            # Write the frame to the output video
            video.write(to_draw)
            k = cv2.waitKey(1)
            if k == ord('q'):
                break

        cv2.destroyAllWindows()
        print('finish')
        pbar.close()
        vid.release()
        video.release()
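# mosaic_area is called above but not defined in this collection. A minimal
# sketch of what it presumably does (an assumption, including the ratio
# parameter): pixelate the region by shrinking it and scaling it back up
# with nearest-neighbour interpolation.
def mosaic_area(img, xmin, ymin, xmax, ymax, ratio=0.05):
    roi = img[ymin:ymax, xmin:xmax]
    h, w = roi.shape[:2]
    # Downscale, then upscale back to the original region size
    small = cv2.resize(roi, (max(1, int(w * ratio)), max(1, int(h * ratio))),
                       interpolation=cv2.INTER_NEAREST)
    return cv2.resize(small, (w, h), interpolation=cv2.INTER_NEAREST)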
def run_camera(input_shape, ssd_model, action_class, clip_length, c3d):
    num_classes = 21
    conf_thresh = 0.5
    bbox_util = BBoxUtility(num_classes)

    # Build one bright, distinguishable color per class (HSV -> BGR)
    class_colors = []
    for i in range(0, num_classes):
        hue = 255 * i / num_classes
        col = np.zeros((1, 1, 3)).astype("uint8")
        col[0][0][0] = hue
        col[0][0][1] = 128  # Saturation
        col[0][0][2] = 255  # Value
        cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
        col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2]))
        class_colors.append(col)

    vid = cv2.VideoCapture(0)
    # Compute aspect ratio of video
    vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
    vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
    # vidar = vidw / vidh

    empty_count = 0
    origin_stack = []
    while True:
        retval, orig_image = vid.read()
        if not retval:
            print("Done!")
            return None

        im_size = (input_shape[0], input_shape[1])
        resized = cv2.resize(orig_image, im_size)
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        inputs = [image.img_to_array(rgb)]
        tmp_inp = np.array(inputs)
        x = preprocess_input(tmp_inp)
        y = ssd_model.predict(x)
        results = bbox_util.detection_out(y)

        if len(results) > 0 and len(results[0]) > 0:
            det_label = results[0][:, 0]
            det_conf = results[0][:, 1]
            det_xmin = results[0][:, 2]
            det_ymin = results[0][:, 3]
            det_xmax = results[0][:, 4]
            det_ymax = results[0][:, 5]
            top_indices = [i for i, conf in enumerate(det_conf)
                           if conf >= conf_thresh]
            top_conf = det_conf[top_indices]
            top_label_indices = det_label[top_indices].tolist()
            top_xmin = det_xmin[top_indices]
            top_ymin = det_ymin[top_indices]
            top_xmax = det_xmax[top_indices]
            top_ymax = det_ymax[top_indices]

            # Label 15 is 'person' in the VOC class list; reset the clip
            # buffer after four consecutive person-free frames
            if 15 not in top_label_indices:
                empty_count += 1
                if empty_count == 4:
                    origin_stack = []
                    empty_count = 0
            else:
                for i in range(top_conf.shape[0]):
                    # Expand the box by roughly 10% per side, clamped to
                    # the frame bounds
                    xmin = int(round(top_xmin[i] * vidw * 0.9))
                    ymin = int(round(top_ymin[i] * vidh * 0.9))
                    xmax = int(round(top_xmax[i] * vidw * 1.1))
                    if xmax > vidw:
                        xmax = int(round(top_xmax[i] * vidw))
                    ymax = int(round(top_ymax[i] * vidh * 1.1))
                    if ymax > vidh:
                        ymax = int(round(top_ymax[i] * vidh))

                    # Save frames for the C3D action classifier
                    class_num = int(top_label_indices[i])
                    if class_num == 15:
                        cv2.rectangle(orig_image, (xmin, ymin), (xmax, ymax),
                                      class_colors[class_num], 2)
                        frame = orig_image
                        # Black out everything except the detected person
                        curl = np.zeros_like(frame, dtype='uint8')
                        curl[ymin:ymax, xmin:xmax, :] = frame[ymin:ymax, xmin:xmax, :]
                        curl = cv2.resize(curl, (171, 128))
                        if len(origin_stack) < clip_length:
                            origin_stack.append(curl[8:120, 30:142, :])
                        if len(origin_stack) == clip_length:
                            origin_stack.pop(0)
                            origin_stack.append(curl[8:120, 30:142, :])
                            # Classify the action once the clip buffer is full
                            clip = np.array(origin_stack)
                            clip = np.expand_dims(clip, axis=0)
                            clip = preprocessing(clip)
                            c3d_result = c3d.predict(clip)
                            if max(c3d_result[0]) >= conf_thresh:
                                label = np.argmax(c3d_result[0])
                                action_name = action_class[label]
                                cv2.putText(orig_image,
                                            action_name + '%.2f' % max(c3d_result[0]),
                                            (xmin + 10, ymin + 10),
                                            cv2.FONT_HERSHEY_SIMPLEX,
                                            0.6, (0, 0, 255), 1)

        cv2.imshow("SSD result", orig_image)
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break

    vid.release()
    cv2.destroyAllWindows()
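# Hypothetical entry point (an assumption, not in the original): wires
# run_camera to the ssd_model, action_classes and c3d objects loaded in the
# PyQt snippet above. A clip_length of 16 matches the usual C3D input length
# but is a guess here.
if __name__ == '__main__':
    run_camera((300, 300, 3), ssd_model, action_classes, 16, c3d)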