def main():
    net = YoloTinyNet(common_params, net_params, test=True)
    # Declare a TensorFlow placeholder for the input image; at run time the
    # feed_dict maps this placeholder to the actual input data.
    image = tf.placeholder(tf.float32, (1, 448, 448, 3))
    predicts = net.inference(image)

    sess = tf.Session()

    # Convert the image into the input format expected by the network.
    np_img = cv2.imread('cat.jpg')
    resized_img = cv2.resize(np_img, (448, 448))
    np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
    np_img = np_img.astype(np.float32)
    # Normalize the input to [-1, 1].
    np_img = np_img / 255.0 * 2 - 1
    np_img = np.reshape(np_img, (1, 448, 448, 3))

    saver = tf.train.Saver(net.trainable_collection)
    saver.restore(sess, 'models/pretrain/yolo_tiny.ckpt')

    # The optional feed_dict argument allows the caller to override
    # the value of tensors in the graph.
    np_predict = sess.run(predicts, feed_dict={image: np_img})

    xmin, ymin, xmax, ymax, class_num = process_predicts(np_predict)
    class_name = classes_name[class_num]
    # Draw the predicted bounding box and label it with the predicted class.
    cv2.rectangle(resized_img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255))
    cv2.putText(resized_img, class_name, (int(xmin), int(ymin)), 2, 1.5, (0, 0, 255))
    cv2.imwrite('cat_out.jpg', resized_img)
    sess.close()
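# The demo above calls a module-level process_predicts() that is not shown in
# this snippet. Below is a minimal sketch of it, reconstructed to be consistent
# with the decoding fragments elsewhere in this section (single best box over a
# 7x7 grid, 448x448 input); the num_classes default of 20 assumes the VOC model.
def process_predicts(predicts, num_classes=20):
    """Decode raw YOLO-tiny output into the single most confident box (sketch).

    predicts: array of shape (1, 7, 7, num_classes + 2*5) from sess.run.
    """
    p_classes = predicts[0, :, :, 0:num_classes]
    C = predicts[0, :, :, num_classes:num_classes + 2]
    coordinate = predicts[0, :, :, num_classes + 2:]

    p_classes = np.reshape(p_classes, (7, 7, 1, num_classes))
    C = np.reshape(C, (7, 7, 2, 1))
    P = C * p_classes  # class-specific confidence per box, shape (7, 7, 2, num_classes)

    index = np.unravel_index(np.argmax(P), P.shape)
    class_num = index[3]

    coordinate = np.reshape(coordinate, (7, 7, 2, 4))
    xcenter, ycenter, w, h = coordinate[index[0], index[1], index[2], :]
    # Cell-relative center -> image coordinates (448 px image, 7x7 grid).
    xcenter = (index[1] + xcenter) * (448 / 7.0)
    ycenter = (index[0] + ycenter) * (448 / 7.0)
    w, h = w * 448, h * 448
    xmin, ymin = xcenter - w / 2.0, ycenter - h / 2.0
    return xmin, ymin, xmin + w, ymin + h, class_num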
        src_xmax = xmax * width_ratio
        src_ymax = ymax * height_ratio
        score = float("%.3f" % score)

        cv2.rectangle(src_img, (int(src_xmin), int(src_ymin)),
                      (int(src_xmax), int(src_ymax)), (0, 0, 255))
        cv2.putText(src_img, object_name + str(score),
                    (int(src_xmin), int(src_ymin)), 1, 2, (0, 0, 255))
    #cv2.imshow("result", src_img)
    cv2.imwrite("result.jpg", src_img)

if __name__ == '__main__':
    common_params = {'image_size': 448, 'num_classes': 20, 'batch_size': 1}
    net_params = {'cell_size': 7, 'boxes_per_cell': 2, 'weight_decay': 0.0005}
    net = YoloTinyNet(common_params, net_params, test=True)

    image = tf.placeholder(tf.float32, (1, 448, 448, 3))
    predicts = net.yoloTinyModel(image)

    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    src_img = cv2.imread("./test2.jpg")
    #src_img = cv2.imread("./data/VOCdevkit2007/VOC2007/JPEGImages/000058.jpg")
    resized_img = cv2.resize(src_img, (448, 448))
    #height_ratio = src_img.shape[0]/448.0
    #width_ratio = src_img.shape[1]/448.0
    w = w * x_size
    h = h * y_size

    xmin = xcenter - w / 2.0
    ymin = ycenter - h / 2.0
    xmax = xmin + w
    ymax = ymin + h
    return xmin, ymin, xmax, ymax, class_num

common_params = {'image_size': x_size, 'num_classes': 20, 'batch_size': 1}
net_params = {'cell_size': 7, 'boxes_per_cell': 2, 'weight_decay': 0.0005}

net = YoloTinyNet(common_params, net_params, test=True)
image = tf.placeholder(tf.float32, (1, x_size, y_size, channel))
predicts = net.inference(image)

saver = tf.train.Saver(net.trainable_collection)
with tf.Session() as sess:
    saver.restore(sess, model_path)
    forderlist = os.listdir(directory)
    filecnt = 0
    for forder in forderlist:
        filelist = os.listdir(directory + '/' + forder)
        for filename in filelist:
            # PNG -> JPEG
            img = Image.open(directory + '/' + forder + '/' + filename)
    w = w * 448
    h = h * 448

    xmin = xcenter - w / 2.0
    ymin = ycenter - h / 2.0
    xmax = xmin + w
    ymax = ymin + h
    return xmin, ymin, xmax, ymax, class_num

common_params = {'image_size': 448, 'num_classes': 20, 'batch_size': 1}
net_params = {'cell_size': 7, 'boxes_per_cell': 2, 'weight_decay': 0.0005}

net = YoloTinyNet(common_params, net_params, test=True)
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
predicts = net.inference(image)

sess = tf.Session()

np_img = cv2.imread('dining_table.jpg')
resized_img = cv2.resize(np_img, (448, 448))
np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
np_img = np_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1
np_img = np.reshape(np_img, (1, 448, 448, 3))
class MultiObjectDetection(VisualBasis):

    def __init__(self, IP, classes_name, cameraId=vd.kTopCamera, resolution=vd.kVGA):
        super(MultiObjectDetection, self).__init__(IP, cameraId, resolution)
        self._classes_name = classes_name
        self._num_classes = len(classes_name)

        self._common_params = {
            'image_size': 448,
            'num_classes': self._num_classes,
            'batch_size': 1
        }
        self._net_params = {
            'cell_size': 7,
            'boxes_per_cell': 2,
            'weight_decay': 0.0005
        }
        self._net = YoloTinyNet(self._common_params, self._net_params, test=True)

    def predict_object(self, image):
        predicts = self._net.inference(image)
        return predicts

    def process_predicts(self, resized_img, predicts, thresh=0.2):
        """
        Process the predicts of object detection with one image input.

        Args:
            resized_img: resized source image.
            predicts: output of the model.
            thresh: threshold of bounding box confidence.
        Return:
            predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
        """
        cls_num = self._num_classes
        bbx_per_cell = self._net_params["boxes_per_cell"]
        cell_size = self._net_params["cell_size"]
        img_size = self._common_params["image_size"]
        p_classes = predicts[0, :, :, 0:cls_num]
        C = predicts[0, :, :, cls_num:cls_num + bbx_per_cell]  # two bounding boxes in one cell.
        coordinate = predicts[0, :, :, cls_num + bbx_per_cell:]  # all bounding box positions.

        p_classes = np.reshape(p_classes, (cell_size, cell_size, 1, cls_num))
        C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))

        # Class-specific confidence for every bounding box:
        # shape (cell_size, cell_size, boxes_per_cell, num_classes).
        P = C * p_classes

        # (cell_size, cell_size, bbox_num_per_cell, coordinate), with
        # each coordinate as [xcenter, ycenter, w, h] relative to the cell.
        coordinate = np.reshape(coordinate, (cell_size, cell_size, bbx_per_cell, 4))

        predicts_dict = {}
        for i in range(cell_size):
            for j in range(cell_size):
                temp_data = np.zeros_like(P, np.float32)
                temp_data[i, j, :, :] = P[i, j, :, :]
                # Flat index of the most confident (box, class) pair in this cell.
                position = np.argmax(temp_data)
                index = np.unravel_index(position, P.shape)

                if P[index] > thresh:
                    class_num = index[-1]
                    max_coordinate = coordinate[index[0], index[1], index[2], :]

                    xcenter = max_coordinate[0]
                    ycenter = max_coordinate[1]
                    w = max_coordinate[2]
                    h = max_coordinate[3]

                    # Cell-relative center -> image coordinates.
                    xcenter = (index[1] + xcenter) * (1.0 * img_size / cell_size)
                    ycenter = (index[0] + ycenter) * (1.0 * img_size / cell_size)
                    w = w * img_size
                    h = h * img_size

                    # Clamp the box to the image: shape[1] is width, shape[0] is height.
                    xmin = max(xcenter - w / 2.0, 0)
                    ymin = max(ycenter - h / 2.0, 0)
                    xmax = min(xmin + w, resized_img.shape[1])
                    ymax = min(ymin + h, resized_img.shape[0])

                    class_name = self._classes_name[class_num]
                    predicts_dict.setdefault(class_name, [])
                    predicts_dict[class_name].append(
                        [int(xmin), int(ymin), int(xmax), int(ymax), P[index]])

        return predicts_dict

    def non_max_suppress(self, predicts_dict, threshold=0.5):
        """
        Implement non-maximum suppression on the predicted bounding boxes.

        Args:
            predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
            threshold: IoU threshold.
        Return:
            predicts_dict processed by non-maximum suppression.
        """
        for object_name, bbox in predicts_dict.items():
            bbox_array = np.array(bbox, dtype=float)
            x1, y1, x2, y2, scores = (bbox_array[:, 0], bbox_array[:, 1],
                                      bbox_array[:, 2], bbox_array[:, 3],
                                      bbox_array[:, 4])
            areas = (x2 - x1 + 1) * (y2 - y1 + 1)
            # Process boxes in descending order of confidence.
            order = scores.argsort()[::-1]
            keep = []
            while order.size > 0:
                i = order[0]
                keep.append(i)
                # Intersection of the current box with all remaining boxes.
                xx1 = np.maximum(x1[i], x1[order[1:]])
                yy1 = np.maximum(y1[i], y1[order[1:]])
                xx2 = np.minimum(x2[i], x2[order[1:]])
                yy2 = np.minimum(y2[i], y2[order[1:]])
                inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
                iou = inter / (areas[i] + areas[order[1:]] - inter)
                # Keep only boxes whose overlap with the current box is small enough.
                indexs = np.where(iou <= threshold)[0]
                order = order[indexs + 1]
            bbox = bbox_array[keep]
            predicts_dict[object_name] = bbox.tolist()
        return predicts_dict

    def plot_result(self, src_img, predicts_dict, save_name=None):
        """
        Plot bounding boxes on the source image.

        Args:
            src_img: source image.
            predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
        """
        # Boxes are in the resized (448x448) frame; scale back to the source image.
        height_ratio = 1.0 * src_img.shape[0] / self._common_params["image_size"]
        width_ratio = 1.0 * src_img.shape[1] / self._common_params["image_size"]
        for object_name, bbox in predicts_dict.items():
            for box in bbox:
                xmin, ymin, xmax, ymax, score = box
                src_xmin = xmin * width_ratio
                src_ymin = ymin * height_ratio
                src_xmax = xmax * width_ratio
                src_ymax = ymax * height_ratio
                score = float("%.3f" % score)

                cv2.rectangle(src_img, (int(src_xmin), int(src_ymin)),
                              (int(src_xmax), int(src_ymax)), (0, 0, 255))
                cv2.putText(src_img, object_name + str(score),
                            (int(src_xmin), int(src_ymin)), 1, 2, (0, 0, 255))
        cv2.imshow("result", src_img)
        if save_name is not None:
            cv2.imwrite(save_name, src_img)

    def object_track(self, predicts_dict, object_name="cup"):
        """Track the specified object with maximum confidence.

        Args:
            object_name: object name.
        """
        # getStiffnesses returns a list of per-joint stiffnesses.
        if min(self._motionProxy.getStiffnesses("Head")) < 1.0:
            self._motionProxy.setStiffnesses("Head", 1.0)
        if min(self._motionProxy.getStiffnesses("LArm")) < 1.0:
            self._motionProxy.setStiffnesses("LArm", 1.0)
        img_size = self._common_params["image_size"]
        img_center_x = img_size / 2
        img_center_y = img_size / 2
        if object_name in predicts_dict:
            predict_coords = predicts_dict[object_name]
            # Track the box with the highest confidence.
            predict_coords.sort(key=lambda coord: coord[-1], reverse=True)
            predict_coord = predict_coords[0]
            xmin, ymin, xmax, ymax, _ = predict_coord
            center_x = (xmin + xmax) / 2.0
            center_y = (ymin + ymax) / 2.0
            # Map the pixel offset from the image center to head joint angles.
            angle_yaw = (center_x - img_center_x) / img_size * self._cameraYawRange
            angle_pitch = (center_y - img_center_y) / img_size * self._cameraPitchRange
            self._motionProxy.angleInterpolation(
                ["HeadPitch", "HeadYaw"],
                [0.8 * angle_pitch, -0.8 * angle_yaw], 0.5, False)
            head_pitch, head_yaw = self._motionProxy.getAngles("Head", False)
            # 7.0/180 keeps the division in floating point under Python 2.
            arm_angle = [
                head_yaw - 7.0 / 180 * np.pi, head_pitch, -1.15, -0.035, -1.54, 0.01
            ]
            self._motionProxy.setAngles("LArm", arm_angle, 0.2)
            self._motionProxy.openHand("LHand")
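# A minimal usage sketch of the detect -> suppress -> plot -> track pipeline
# above. ROBOT_IP and the checkpoint path are assumptions, and object_track()
# needs a live NAO connection; adapt these to the actual VisualBasis setup.
import cv2
import numpy as np
import tensorflow as tf

classes_name = ["cup"]
detector = MultiObjectDetection("ROBOT_IP", classes_name)

image = tf.placeholder(tf.float32, (1, 448, 448, 3))
predicts = detector.predict_object(image)

saver = tf.train.Saver(detector._net.trainable_collection)
with tf.Session() as sess:
    saver.restore(sess, "models/train/model.ckpt")  # hypothetical checkpoint path

    src_img = cv2.imread("test.jpg")  # or a frame grabbed from the robot camera
    resized_img = cv2.resize(src_img, (448, 448))
    np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB).astype(np.float32)
    np_img = np.reshape(np_img / 255.0 * 2 - 1, (1, 448, 448, 3))

    np_predict = sess.run(predicts, feed_dict={image: np_img})
    predicts_dict = detector.process_predicts(resized_img, np_predict)
    predicts_dict = detector.non_max_suppress(predicts_dict)
    detector.plot_result(src_img, predicts_dict, save_name="result.jpg")
    cv2.waitKey(0)
    detector.object_track(predicts_dict, object_name="cup")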
    xmax = xmin + w
    ymax = ymin + h
    # Return the top-left and bottom-right coordinates.
    return xmin, ymin, xmax, ymax, class_num, confidence

common_params = {'image_size': 448, 'num_classes': 20, 'batch_size': 1}
net_params = {
    'cell_size': 7,
    'boxes_per_cell': 2,
    'weight_decay': 0.0005
}  # network structure parameters

# Build the network: pass in the parameter dicts and get a network object back.
net = YoloTinyNet(common_params, net_params, test=True)

image = tf.placeholder(tf.float32, (1, 448, 448, 3))  # input image size
predicts = net.inference(image)
# print(predicts.shape)

sess = tf.Session()

np_img = cv2.imread('dog.jpg')
resized_img = cv2.resize(np_img, (448, 448))
np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)  # convert color channels (BGR -> RGB)
np_img = np_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1  # normalize to [-1, 1]
np_img = np.reshape(np_img, (1, 448, 448, 3))
class ObjectDetection(VisualBasis):

    def __init__(self, IP, cameraId=vd.kTopCamera, resolution=vd.kVGA):
        super(ObjectDetection, self).__init__(IP, cameraId, resolution)
        self._boundRect = []
        self._cropKeep = 1
        self._stickAngle = None  # rad
        #self._classes_name = ["stick"]
        self._common_params = {'image_size': 448, 'num_classes': 1, 'batch_size': 1}
        self._net_params = {'cell_size': 7, 'boxes_per_cell': 2, 'weight_decay': 0.0005}
        self._net = YoloTinyNet(self._common_params, self._net_params, test=True)
        #self._modelFile = "/home/meringue/Documents/python-nao-golf/yoloNet/models/train/model.ckpt-95000"
        #self._objectRect = [0, 0, 0, 0]
        self._objectName = None

    def predict_single_object(self, image):
        predicts = self._net.inference(image)
        return predicts

    def process_predicts(self, predicts):
        # Split the raw output: 1 class probability, 2 box confidences,
        # and 2*4 box coordinates per grid cell.
        p_classes = predicts[0, :, :, 0:1]
        C = predicts[0, :, :, 1:3]
        coordinate = predicts[0, :, :, 3:]

        p_classes = np.reshape(p_classes, (7, 7, 1, 1))
        C = np.reshape(C, (7, 7, 2, 1))
        P = C * p_classes

        index = np.argmax(P)
        print("confidence = ", np.max(P))
        index = np.unravel_index(index, P.shape)
        class_num = index[3]

        coordinate = np.reshape(coordinate, (7, 7, 2, 4))
        max_coordinate = coordinate[index[0], index[1], index[2], :]

        xcenter = max_coordinate[0]
        ycenter = max_coordinate[1]
        w = max_coordinate[2]
        h = max_coordinate[3]

        # Cell-relative center -> image coordinates (448 px image, 7x7 grid).
        xcenter = (index[1] + xcenter) * (448 / 7.0)
        ycenter = (index[0] + ycenter) * (448 / 7.0)
        w = w * 448
        h = h * 448
        xmin = xcenter - w / 2.0
        ymin = ycenter - h / 2.0
        xmax = xmin + w
        ymax = ymin + h
        return [xmin, ymin, xmax, ymax], class_num

    def showDetectResult(self, frame, rect, object_name):
        object_min_xy = (int(rect[0]), int(rect[1]))
        object_max_xy = (int(rect[2]), int(rect[3]))
        cv2.rectangle(frame, object_min_xy, object_max_xy, (0, 0, 255))
        cv2.putText(frame, object_name, object_min_xy, 2, 2, (0, 0, 255))
        cv2.imshow("detect result", frame)
        #cv2.waitKey(10)
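# A minimal usage sketch for the single-object detector above. ROBOT_IP and the
# checkpoint path are assumptions (the commented-out self._modelFile hints at
# where a trained single-class "stick" model would live).
import cv2
import numpy as np
import tensorflow as tf

detector = ObjectDetection("ROBOT_IP")
detector._objectName = "stick"

image = tf.placeholder(tf.float32, (1, 448, 448, 3))
predicts = detector.predict_single_object(image)

saver = tf.train.Saver(detector._net.trainable_collection)
with tf.Session() as sess:
    saver.restore(sess, "models/train/model.ckpt-95000")  # hypothetical path

    frame = cv2.imread("stick.jpg")  # or a camera frame from the robot
    resized = cv2.resize(frame, (448, 448))
    np_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB).astype(np.float32)
    np_img = np.reshape(np_img / 255.0 * 2 - 1, (1, 448, 448, 3))

    np_predict = sess.run(predicts, feed_dict={image: np_img})
    rect, class_num = detector.process_predicts(np_predict)
    # rect is in the 448x448 frame, so draw on the resized image.
    detector.showDetectResult(resized, rect, detector._objectName)
    cv2.waitKey(0)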