def run_network(self, img: np.ndarray):
    """ Runs an image through the network + postprocessing and returns the masks and bboxes

    Args:
        img (np.ndarray): The image to process.

    Returns:
        (tuple): the masks and bboxes
    """
    # Run image through the network
    img_gpu = torch.from_numpy(img).cuda().float()
    batch = FastBaseTransform()(img_gpu.unsqueeze(0))
    preds = self.net(batch)
    h, w, _ = img.shape

    # Post process
    t = postprocess(preds, w, h, visualize_lincomb=True, crop_masks=True,
                    score_threshold=0.15)
    top_k = 15  # Further restrict the number of predictions to parse
    idx = t[1].argsort(0, descending=True)[:top_k]
    masks = t[3][idx].cpu().numpy()
    classes, scores, boxes = [x[idx].cpu().numpy() for x in t[:3]]
    return masks, boxes
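# Usage sketch for run_network above (hedged: `Detector` stands in for the
# enclosing class, which is assumed to hold a loaded YOLACT model on self.net;
# 'test.png' is a placeholder path):
#
#   import cv2
#   det = Detector()                      # hypothetical constructor
#   img = cv2.imread('test.png')          # HxWx3 BGR uint8 array
#   masks, boxes = det.run_network(img)   # top-15 masks and their xyxy boxes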
def segmentation(self, img):
    with torch.no_grad():
        h, w, _ = img.shape
        frame = torch.from_numpy(img).cuda().float()
        batch = FastBaseTransform()(frame.unsqueeze(0))
        preds = self.net(batch)
        classes, scores, boxes, masks = yolact_module.prep_display(
            5, preds, frame, 0.5, h, w, undo_transform=True,
            class_color=False, mask_alpha=0.45, fps_str='')
        if not len(masks):
            return np.zeros((img.shape[0], img.shape[1]))
        mask = masks[0]
        mask = mask.cpu().numpy()
        h, w = mask.shape
        filled_mask = np.zeros([h, w])
        contours = yolact_module.cv_contours(np.uint8(mask))
        C = len(contours)
        contours = sorted(contours, key=lambda x: cv2.contourArea(x))
        # Fill the biggest contour (last after sorting by area, ascending)
        cv2.drawContours(filled_mask, contours, C - 1, 255, thickness=-1)
        return filled_mask
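# Usage sketch for segmentation above (hedged: `seg` stands in for an instance
# of the enclosing class; 'scene.png' and 'mask.png' are placeholder paths).
# The return value is an HxW array with 255 filled inside the largest contour:
#
#   img = cv2.imread('scene.png')
#   filled = seg.segmentation(img)
#   cv2.imwrite('mask.png', filled)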
def evalimage(self):
    cv_img = self.get_data()
    # frame = torch.from_numpy(cv2.imread(path)).cuda().float()
    frame = torch.from_numpy(cv_img).cuda().float()
    batch = FastBaseTransform()(frame.unsqueeze(0))
    preds = self.net(batch)
    self.prep_display(preds, frame, None, None, undo_transform=False)
def evalimage(net: Yolact, path: str, save_path: str = None):
    frame = torch.from_numpy(cv2.imread(path)).cuda().float()
    batch = FastBaseTransform()(frame.unsqueeze(0))
    preds = net(batch)
    img_numpy = prep_display(preds, frame, None, None, net.cfg,
                             undo_transform=False)
    if save_path is None:
        # Convert BGR -> RGB and display instead of writing to a None path
        img_numpy = img_numpy[:, :, (2, 1, 0)]
        plt.imshow(img_numpy)
        plt.title(path)
        plt.show()
    else:
        cv2.imwrite(save_path, img_numpy)
def prediction(self, img):
    self.net.detect.cross_class_nms = True
    cfg.mask_proto_debug = False
    with torch.no_grad():
        frame = torch.Tensor(img).cuda().float()
        batch = FastBaseTransform()(frame.unsqueeze(0))
        # time.clock() was removed in Python 3.8; use perf_counter() instead
        time_start = time.perf_counter()
        preds = self.net(batch)
        h, w, _ = img.shape
        t = postprocess(preds, w, h, visualize_lincomb=False,
                        crop_masks=True, score_threshold=self.threshold)
        torch.cuda.synchronize()
        masks = t[3][:self.top_k]
        classes, scores, bboxes = [
            x[:self.top_k].cpu().numpy() for x in t[:3]
        ]
        time_elapsed = time.perf_counter() - time_start

        num_dets_to_consider = min(self.top_k, classes.shape[0])
        for i in range(num_dets_to_consider):
            if scores[i] < self.threshold:
                num_dets_to_consider = i
                break

        if num_dets_to_consider >= 1:
            masks = masks[:num_dets_to_consider, :, :, None]
        masks_msg = masks.cpu().detach().numpy()
        masks_msg = masks_msg.astype(np.uint8)
        scores_msg = np.zeros(num_dets_to_consider)
        class_label_msg = np.empty(num_dets_to_consider, dtype="S20")
        bboxes_msg = np.zeros([num_dets_to_consider, 4], dtype=int)
        for i in reversed(range(num_dets_to_consider)):
            scores_msg[i] = scores[i]
            class_label_msg[i] = cfg.dataset.class_names[classes[i]]
            bboxes_msg[i] = bboxes[i]
            print(class_label_msg[i].decode(), "%.2f" % (scores_msg[i]))
        os.system('cls' if os.name == 'nt' else 'clear')
        print("%.2f" % (1 / time_elapsed), "hz")
        if self.display_cv:
            self.display(frame, masks, classes, scores, bboxes,
                         num_dets_to_consider)
    return masks_msg, class_label_msg, scores_msg, bboxes_msg
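# Usage sketch for prediction above (hedged: `node` stands in for an instance
# of the enclosing class with self.net, self.threshold, self.top_k and
# self.display_cv already configured):
#
#   masks, labels, scores, bboxes = node.prediction(img)
#   # masks:  uint8 instance masks; labels: byte strings (use .decode());
#   # scores: float array; bboxes: (N, 4) int array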
def evalimage(net: Yolact, path: str, save_path: str = None):
    frame = torch.from_numpy(cv2.imread(path)).cuda().float()
    batch = FastBaseTransform(with_cuda=net.with_cuda)(frame.unsqueeze(0))
    preds = net(batch)
    img_numpy = prep_display(preds, frame, None, None, undo_transform=False)
    if save_path is None:
        # Convert BGR -> RGB for display
        img_numpy = img_numpy[:, :, (2, 1, 0)]
        plt.imshow(img_numpy)
        plt.title(path)
        plt.show()
    else:
        cv2.imwrite(save_path, img_numpy)
def predict(self, image_array: np.ndarray):
    """
    :image_path : image numpy array
    Format of returned boxes is [x1,y1,x2,y2], individual centers are tuples
    :return entire mask, individual masks, boxes, centers
    """
    with torch.no_grad():
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        frame = torch.from_numpy(image_array).cuda().float()
        batch = FastBaseTransform()(frame.unsqueeze(0))
        # Note: the network is rebuilt and its weights reloaded on every call;
        # caching it on self would avoid the repeated load.
        net = Yolact()
        net.detect.use_fast_nms = True
        net.detect.use_cross_class_nms = True
        net.load_weights(self.weights)
        net.eval()
        preds = net(batch)
        mask_entire, boxes = prep_display(preds, frame, None, None,
                                          undo_transform=False)
        if len(boxes) < 1:
            return mask_entire, None, None, None
        mask_dict = {}
        centers_dict = {}
        boxes_dict = {}
        for index in range(len(boxes)):
            current_box = boxes[index]
            mask_dict[index] = mask_entire[current_box[1]:current_box[3],
                                           current_box[0]:current_box[2]]
            center = Segment.find_center(mask_dict[index])
            if not center:
                adjusted_center = None
            else:
                adjusted_center = Segment.adjust_centers(center, current_box)
            centers_dict[index] = adjusted_center
            boxes_dict[index] = current_box
    return mask_entire, mask_dict, centers_dict, boxes_dict
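# Usage sketch for Segment.predict above (hedged: the constructor signature is
# hypothetical; only self.weights is known to be used):
#
#   seg = Segment(weights='yolact.pth')   # placeholder weights path
#   img = cv2.imread('frame.png')         # placeholder image path
#   mask_entire, masks, centers, boxes = seg.predict(img)
#   if masks is not None:
#       for i in boxes:
#           print(i, boxes[i], centers[i])  # per-instance box and center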
def evalimage(self, path: str = None, save_path: str = None):
    cv_img = self.get_data()
    # frame = torch.from_numpy(cv2.imread(path)).cuda().float()
    frame = torch.from_numpy(cv_img).cuda().float()
    batch = FastBaseTransform()(frame.unsqueeze(0))
    preds = self.net(batch)
    img_numpy = self.prep_display(preds, frame, None, None,
                                  undo_transform=False)
    if save_path is None:
        # Convert BGR -> RGB for display
        img_numpy = img_numpy[:, :, (2, 1, 0)]
        plt.imshow(img_numpy)
        plt.title(path)
        plt.show()
    else:
        cv2.imwrite(save_path, img_numpy)
    try:
        self.image_pub.publish(self.bridge.cv2_to_imgmsg(img_numpy, "bgr8"))
    except CvBridgeError as e:
        print(e)
def evalvideo(self, net: Yolact, path: str):
    # If the path is a digit, parse it as a webcam index
    is_webcam = path.isdigit()

    if is_webcam:
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)

    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)

    net = CustomDataParallel(net).cuda()
    transform = torch.nn.DataParallel(FastBaseTransform()).cuda()
    frame_times = MovingAverage(100)
    fps = 0
    frame_time_target = 1 / vid.get(cv2.CAP_PROP_FPS)
    running = True

    def cleanup_and_exit():
        print()
        pool.terminate()
        vid.release()
        cv2.destroyAllWindows()
        exit()

    def get_next_frame(vid):
        return [vid.read()[1] for _ in range(args.video_multiframe)]

    def transform_frame(frames):
        with torch.no_grad():
            frames = [torch.from_numpy(frame).cuda().float()
                      for frame in frames]
            return frames, transform(torch.stack(frames, 0))

    def eval_network(inp):
        with torch.no_grad():
            frames, imgs = inp
            return frames, net(imgs)

    def prep_frame(inp):
        with torch.no_grad():
            frame, preds = inp
            return self.prep_display(preds, frame, None, None,
                                     undo_transform=False, class_color=True)

    frame_buffer = Queue()
    video_fps = 0

    # All this timing code keeps video playback running at a steady FPS
    def play_video():
        nonlocal frame_buffer, running, video_fps, is_webcam

        video_frame_times = MovingAverage(100)
        frame_time_stabilizer = frame_time_target
        last_time = None
        stabilizer_step = 0.0005

        while running:
            frame_time_start = time.time()

            if not frame_buffer.empty():
                next_time = time.time()
                if last_time is not None:
                    video_frame_times.add(next_time - last_time)
                    video_fps = 1 / video_frame_times.get_avg()
                cv2.imshow(path, frame_buffer.get())
                last_time = next_time
                # self.image_pub.publish(self.bridge.cv2_to_imgmsg(frame_buffer.get(), "bgr8"))

            if cv2.waitKey(1) == 27:  # Press Escape to close
                running = False

            buffer_size = frame_buffer.qsize()
            if buffer_size < args.video_multiframe:
                frame_time_stabilizer += stabilizer_step
            elif buffer_size > args.video_multiframe:
                frame_time_stabilizer -= stabilizer_step
                if frame_time_stabilizer < 0:
                    frame_time_stabilizer = 0

            new_target = frame_time_stabilizer if is_webcam else max(
                frame_time_stabilizer, frame_time_target)
            next_frame_target = max(
                2 * new_target - video_frame_times.get_avg(), 0)
            # Let's just subtract a millisecond to be safe
            target_time = frame_time_start + next_frame_target - 0.001

            # This gives more accurate timing than if sleeping the whole amount at once
            while time.time() < target_time:
                time.sleep(0.001)

    extract_frame = lambda x, i: (
        x[0][i] if x[1][i] is None else x[0][i].to(x[1][i]['box'].device),
        [x[1][i]])

    # Prime the network on the first frame because I do some thread unsafe things otherwise
    print('Initializing model... ', end='')
    eval_network(transform_frame(get_next_frame(vid)))
    print('Done.')

    # For each frame, the sequence of functions it needs to go through to be processed (in reverse order)
    sequence = [prep_frame, eval_network, transform_frame]
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2)
    pool.apply_async(play_video)

    active_frames = []

    print()
    while vid.isOpened() and running:
        start_time = time.time()

        # Start loading the next frames from the disk
        next_frames = pool.apply_async(get_next_frame, args=(vid,))

        # For each frame in our active processing queue, dispatch a job
        # for that frame using the current function in the sequence
        for frame in active_frames:
            frame['value'] = pool.apply_async(sequence[frame['idx']],
                                              args=(frame['value'],))

        # For each frame whose job was the last in the sequence (i.e. for all final outputs)
        for frame in active_frames:
            if frame['idx'] == 0:
                frame_buffer.put(frame['value'].get())

        # Remove the finished frames from the processing queue
        active_frames = [x for x in active_frames if x['idx'] > 0]

        # Finish evaluating every frame in the processing queue and advance their position in the sequence
        for frame in list(reversed(active_frames)):
            frame['value'] = frame['value'].get()
            frame['idx'] -= 1

            if frame['idx'] == 0:
                # Split this up into individual threads for prep_frame since it doesn't support batch size
                active_frames += [{'value': extract_frame(frame['value'], i),
                                   'idx': 0}
                                  for i in range(1, args.video_multiframe)]
                frame['value'] = extract_frame(frame['value'], 0)

        # Finish loading in the next frames and add them to the processing queue
        active_frames.append({'value': next_frames.get(),
                              'idx': len(sequence) - 1})

        # Compute FPS
        frame_times.add(time.time() - start_time)
        fps = args.video_multiframe / frame_times.get_avg()

        print('\rProcessing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d    '
              % (fps, video_fps, frame_buffer.qsize()), end='')

    cleanup_and_exit()
def evalvideo(net: Yolact, path: str, out_path: str = None):
    # If the path is a digit, parse it as a webcam index
    is_webcam = path.isdigit()

    # If the input image size is constant, this makes things faster (hence why we can use it in a video setting).
    cudnn.benchmark = True

    if is_webcam:
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)

    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)

    target_fps = round(vid.get(cv2.CAP_PROP_FPS))
    frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = round(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

    if is_webcam:
        num_frames = float('inf')
    else:
        num_frames = round(vid.get(cv2.CAP_PROP_FRAME_COUNT))

    net = CustomDataParallel(net).cuda()
    transform = torch.nn.DataParallel(
        FastBaseTransform(with_cuda=net.with_cuda))
    if net.with_cuda:
        transform = transform.cuda()
    frame_times = MovingAverage(100)
    fps = 0
    frame_time_target = 1 / target_fps
    running = True
    fps_str = ''
    vid_done = False
    frames_displayed = 0

    if out_path is not None:
        out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"),
                              target_fps, (frame_width, frame_height))

    def cleanup_and_exit():
        print()
        pool.terminate()
        vid.release()
        if out_path is not None:
            out.release()
        cv2.destroyAllWindows()
        exit()

    def get_next_frame(vid):
        frames = []
        for idx in range(args.video_multiframe):
            frame = vid.read()[1]
            if frame is None:
                return frames
            frames.append(frame)
        return frames

    def transform_frame(frames):
        with torch.no_grad():
            frames = [torch.from_numpy(frame).cuda().float()
                      for frame in frames]
            return frames, transform(torch.stack(frames, 0))

    def eval_network(inp):
        with torch.no_grad():
            frames, imgs = inp
            num_extra = 0
            while imgs.size(0) < args.video_multiframe:
                imgs = torch.cat([imgs, imgs[0].unsqueeze(0)], dim=0)
                num_extra += 1
            out = net(imgs)
            if num_extra > 0:
                out = out[:-num_extra]
            return frames, out

    def prep_frame(inp, fps_str):
        with torch.no_grad():
            frame, preds = inp
            return prep_display(preds, frame, None, None,
                                undo_transform=False, class_color=True,
                                fps_str=fps_str)

    frame_buffer = Queue()
    video_fps = 0

    # All this timing code keeps video playback running at a steady FPS
    def play_video():
        try:
            nonlocal frame_buffer, running, video_fps, is_webcam, num_frames, frames_displayed, vid_done

            video_frame_times = MovingAverage(100)
            frame_time_stabilizer = frame_time_target
            last_time = None
            stabilizer_step = 0.0005
            progress_bar = ProgressBar(30, num_frames)

            while running:
                frame_time_start = time.time()

                if not frame_buffer.empty():
                    next_time = time.time()
                    if last_time is not None:
                        video_frame_times.add(next_time - last_time)
                        video_fps = 1 / video_frame_times.get_avg()
                    if out_path is None:
                        cv2.imshow(path, frame_buffer.get())
                    else:
                        out.write(frame_buffer.get())
                    frames_displayed += 1
                    last_time = next_time

                    if out_path is not None:
                        if video_frame_times.get_avg() == 0:
                            fps = 0
                        else:
                            fps = 1 / video_frame_times.get_avg()
                        progress = frames_displayed / num_frames * 100
                        progress_bar.set_val(frames_displayed)
                        print('\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps    '
                              % (repr(progress_bar), frames_displayed,
                                 num_frames, progress, fps), end='')

                # This is split because you don't want savevideo to require cv2 display functionality (see #197)
                if out_path is None and cv2.waitKey(1) == 27:
                    # Press Escape to close
                    running = False
                if not (frames_displayed < num_frames):
                    running = False

                if not vid_done:
                    buffer_size = frame_buffer.qsize()
                    if buffer_size < args.video_multiframe:
                        frame_time_stabilizer += stabilizer_step
                    elif buffer_size > args.video_multiframe:
                        frame_time_stabilizer -= stabilizer_step
                        if frame_time_stabilizer < 0:
                            frame_time_stabilizer = 0

                    new_target = frame_time_stabilizer if is_webcam else max(
                        frame_time_stabilizer, frame_time_target)
                else:
                    new_target = frame_time_target

                next_frame_target = max(
                    2 * new_target - video_frame_times.get_avg(), 0)
                # Let's just subtract a millisecond to be safe
                target_time = frame_time_start + next_frame_target - 0.001

                if out_path is None or args.emulate_playback:
                    # This gives more accurate timing than if sleeping the whole amount at once
                    while time.time() < target_time:
                        time.sleep(0.001)
                else:
                    # Let's not starve the main thread, now
                    time.sleep(0.001)
        except:
            # See issue #197 for why this is necessary
            import traceback
            traceback.print_exc()

    extract_frame = lambda x, i: (
        x[0][i] if x[1][i]['detection'] is None
        else x[0][i].to(x[1][i]['detection']['box'].device),
        [x[1][i]])

    # Prime the network on the first frame because I do some thread unsafe things otherwise
    print('Initializing model... ', end='')
    first_batch = eval_network(transform_frame(get_next_frame(vid)))
    print('Done.')

    # For each frame, the sequence of functions it needs to go through to be processed (in reverse order)
    sequence = [prep_frame, eval_network, transform_frame]
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2)
    pool.apply_async(play_video)
    active_frames = [{'value': extract_frame(first_batch, i), 'idx': 0}
                     for i in range(len(first_batch[0]))]

    print()
    if out_path is None:
        print('Press Escape to close.')
    try:
        while vid.isOpened() and running:
            # Hard limit on frames in buffer so we don't run out of memory >.>
            while frame_buffer.qsize() > 100:
                time.sleep(0.001)

            start_time = time.time()

            # Start loading the next frames from the disk
            if not vid_done:
                next_frames = pool.apply_async(get_next_frame, args=(vid,))
            else:
                next_frames = None

            if not (vid_done and len(active_frames) == 0):
                # For each frame in our active processing queue, dispatch a job
                # for that frame using the current function in the sequence
                for frame in active_frames:
                    _args = [frame['value']]
                    if frame['idx'] == 0:
                        _args.append(fps_str)
                    frame['value'] = pool.apply_async(sequence[frame['idx']],
                                                      args=_args)

                # For each frame whose job was the last in the sequence (i.e. for all final outputs)
                for frame in active_frames:
                    if frame['idx'] == 0:
                        frame_buffer.put(frame['value'].get())

                # Remove the finished frames from the processing queue
                active_frames = [x for x in active_frames if x['idx'] > 0]

                # Finish evaluating every frame in the processing queue and advance their position in the sequence
                for frame in list(reversed(active_frames)):
                    frame['value'] = frame['value'].get()
                    frame['idx'] -= 1

                    if frame['idx'] == 0:
                        # Split this up into individual threads for prep_frame since it doesn't support batch size
                        active_frames += [{'value': extract_frame(frame['value'], i),
                                           'idx': 0}
                                          for i in range(1, len(frame['value'][0]))]
                        frame['value'] = extract_frame(frame['value'], 0)

                # Finish loading in the next frames and add them to the processing queue
                if next_frames is not None:
                    frames = next_frames.get()
                    if len(frames) == 0:
                        vid_done = True
                    else:
                        active_frames.append({'value': frames,
                                              'idx': len(sequence) - 1})

                # Compute FPS
                frame_times.add(time.time() - start_time)
                fps = args.video_multiframe / frame_times.get_avg()
            else:
                fps = 0

            fps_str = 'Processing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d' % (
                fps, video_fps, frame_buffer.qsize())
            if not args.display_fps:
                print('\r' + fps_str + '    ', end='')
    except KeyboardInterrupt:
        print('\nStopping...')

    cleanup_and_exit()
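# Usage sketch for evalvideo above (hedged: assumes `args` has been parsed as
# in YOLACT's eval.py and that the checkpoint path is a placeholder):
#
#   net = Yolact()
#   net.load_weights('weights/yolact_base.pth')  # placeholder checkpoint
#   net.eval()
#   evalvideo(net, '0')                       # webcam index 0, display only
#   evalvideo(net, 'in.mp4', 'out.mp4')       # or read and write video files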
color_image = np.asanyarray(color_frame.get_data())
# r, g, b = cv2.split(color_image)
# color_image = cv2.merge((b, g, r))
cv2.imshow('color_image', color_image)
aligned_depth_frame = aligned_frames.get_depth_frame()
aligned_depth_image = np.asanyarray(aligned_depth_frame.get_data())

# img = Image.open(folder_path + '/color_image' + str(now) + '.png')
# depth = np.array(Image.open(folder_path + '/depth_image' + str(now) + '.png'))
img = color_image
depth = aligned_depth_image
frame = torch.from_numpy(np.array(img)).cuda().float()
batch = FastBaseTransform()(frame.unsqueeze(0))
preds = yolact(batch)
try:
    masks, classes, boxes, img_numpy = prep_display(
        preds, frame, None, None, undo_transform=False)
except Exception:
    print("Yolact exception occurred!")
    continue
# print(classes)
# print(boxes)
#
# plt.imshow(img_numpy)
# plt.title("pred")
# plt.show()
def process(self, image: np.ndarray, pos: int):
    """:returns (classes, scores, boxes)

    where `boxes` is an array of bounding boxes of detected objects in
    (xleft, ytop, width, height) format.

    `classes` is the class ids of the corresponding objects.

    `scores` are the computed class scores corresponding to the detected
    objects. Roughly, a high score indicates a strong belief that the object
    belongs to the identified class.
    """
    _ts = time.perf_counter()
    logging.debug(f'Received frame {pos}')
    if self.net is None:
        self.sigError.emit(YolactException('Network not initialized'))
        return
    # Partly follows yolact eval.py
    tic = time.perf_counter_ns()
    _ = qc.QMutexLocker(self.mutex)
    with torch.no_grad():
        if self.cuda:
            image = torch.from_numpy(image).cuda().float()
        else:
            image = torch.from_numpy(image).float()
        batch = FastBaseTransform()(image.unsqueeze(0))
        preds = self.net(batch)
        image_gpu = image / 255.0
        h, w, _ = image.shape
        save = self.config.rescore_bbox
        self.config.rescore_bbox = True
        classes, scores, boxes, masks = oututils.postprocess(
            preds, w, h, visualize_lincomb=False, crop_masks=True,
            score_threshold=self.score_threshold)
        idx = scores.argsort(0, descending=True)[:self.top_k]
        # if self.config.eval_mask_branch:
        #     masks = masks[idx]
        classes, scores, boxes = [
            x[idx].cpu().numpy() for x in (classes, scores, boxes)
        ]
        # This is probably not required, `postprocess` uses
        # `score_thresh` already
        num_dets_to_consider = min(self.top_k, classes.shape[0])
        for j in range(num_dets_to_consider):
            if scores[j] < self.score_threshold:
                num_dets_to_consider = j
                break
        # logging.debug('Bounding boxes: %r', boxes)
        # Convert from top-left bottom-right format to
        # top-left, width, height format
        if len(boxes) == 0:
            self.sigProcessed.emit(boxes, pos)
            return
        boxes[:, 2:] = boxes[:, 2:] - boxes[:, :2]
        boxes = np.asanyarray(boxes, dtype=np.int_)
        if self.overlap_thresh < 1:
            dist_matrix = pairwise_distance(new_bboxes=boxes, bboxes=boxes,
                                            boxtype=OutlineStyle.bbox,
                                            metric=DistanceMetric.ios)
            bad_idx = [jj for ii in range(dist_matrix.shape[0] - 1)
                       for jj in range(ii + 1, dist_matrix.shape[1])
                       if dist_matrix[ii, jj] < 1 - self.overlap_thresh]
            good_idx = list(set(range(boxes.shape[0])) - set(bad_idx))
            boxes = boxes[good_idx].copy()
        toc = time.perf_counter_ns()
        logging.debug('Time to process single image: %f s',
                      1e-9 * (toc - tic))
        self.sigProcessed.emit(boxes, pos)
        logging.debug(f'Emitted bboxes for frame {pos}: {boxes}')
    _dt = time.perf_counter() - _ts
    logging.debug(
        f'{__name__}.{self.__class__.__name__}.process: Runtime: {_dt}s')
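# Usage note for process above: results are delivered through the Qt signal
# `sigProcessed` rather than a return value, so a caller connects a slot first
# (hedged: `worker` stands in for an instance of the enclosing QObject subclass):
#
#   worker.sigProcessed.connect(lambda boxes, pos: print(pos, boxes))
#   worker.process(image, pos=0)   # emits (xleft, ytop, width, height) rows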
def segment_yolact(frame, score_threshold, top_k, overlap_thresh, cfgfile,
                   netfile, cuda):
    """Segment objects in frame using YOLACT.

    Parameters
    ----------
    frame: numpy.ndarray
        (WxHxC) integer array with the image content.
    score_threshold: float
        Minimum score to include object, should be in `(0, 1)`.
    top_k: int
        The number of segmented objects to keep.
    overlap_thresh: float
        Merge objects whose bounding boxes overlap (intersection over union)
        more than this amount.
    cfgfile: str
        Path to YOLACT configuration file.
    netfile: str
        Path to YOLACT network weights file.
    cuda: bool
        Whether to use CUDA.

    Returns
    -------
    numpy.ndarray
        An array of bounding boxes of detected objects in
        (xleft, ytop, width, height) format.
    """
    global ynet
    global config

    if ynet is None:
        init_yolact(cfgfile, netfile, cuda)
    # Partly follows yolact eval.py
    tic = time.perf_counter_ns()
    with torch.no_grad():
        if cuda:
            frame = torch.from_numpy(frame).cuda().float()
        else:
            frame = torch.from_numpy(frame).float()
        batch = FastBaseTransform()(frame.unsqueeze(0))
        preds = ynet(batch)
        h, w, _ = frame.shape
        config.rescore_bbox = True
        classes, scores, boxes, masks = oututils.postprocess(
            preds, w, h, visualize_lincomb=False, crop_masks=True,
            score_threshold=score_threshold)
        idx = scores.argsort(0, descending=True)[:top_k]
        # if self.config.eval_mask_branch:
        #     masks = masks[idx]
        classes, scores, boxes = [x[idx].cpu().numpy()
                                  for x in (classes, scores, boxes)]
        # This is probably not required, `postprocess` uses
        # `score_thresh` already
        # num_dets_to_consider = min(self.top_k, classes.shape[0])
        # for j in range(num_dets_to_consider):
        #     if scores[j] < self.score_threshold:
        #         num_dets_to_consider = j
        #         break
        # logging.debug('Bounding boxes: %r', boxes)
        # Convert from top-left bottom-right format to
        # top-left, width, height format
        if len(boxes) == 0:
            return np.empty(0)
        boxes[:, 2:] = boxes[:, 2:] - boxes[:, :2]
        boxes = np.asanyarray(np.rint(boxes), dtype=np.int_)
        if overlap_thresh < 1:
            dist_matrix = ut.pairwise_distance(new_bboxes=boxes, bboxes=boxes,
                                               boxtype=OutlineStyle.bbox,
                                               metric=DistanceMetric.iou)
            bad_boxes = []
            for ii in range(dist_matrix.shape[0] - 1):
                for jj in range(ii + 1, dist_matrix.shape[1]):
                    if dist_matrix[ii, jj] < 1 - overlap_thresh:
                        bad_boxes.append(jj)
            boxes = np.array([boxes[ii] for ii in range(boxes.shape[0])
                              if ii not in bad_boxes], dtype=np.int_)
    toc = time.perf_counter_ns()
    logging.debug('Time to process single image: %f s', 1e-9 * (toc - tic))
    return boxes
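# Usage sketch for segment_yolact above (hedged: 'config.py' and 'weights.pth'
# are placeholder paths for a real YOLACT config and checkpoint):
#
#   frame = cv2.imread('example.png')
#   boxes = segment_yolact(frame, score_threshold=0.15, top_k=10,
#                          overlap_thresh=0.9, cfgfile='config.py',
#                          netfile='weights.pth', cuda=True)
#   # each row of `boxes` is (xleft, ytop, width, height)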