class TestYOLOv4SceneSensor(object):
    def setup_class(self):
        self.roi_feat_resolution = 5
        self.scene_sensor = SceneSensor(
            YOLOv4_MODEL,
            gpu=0,
            img_shape=[3, 416, 416],
            roi_feat_resolution=self.roi_feat_resolution,
            algorithm='yolov4')
        self.frames = clip_video_to_frames(R2_VIDEO, 0., None)

    def test_get_instances(self, export=True):
        instances_lst = self.scene_sensor.get_instances(self.frames)
        assert len(instances_lst) == len(self.frames)

        if export:
            h, w, fps = 720, 1280, 24.  # read from R2_VIDEO
            video_writer = VideoWriter('data/scene_yolo4_demo.mp4', (w, h), fps)
            for frame, instances in zip(self.frames, instances_lst):
                bboxes = np.array([i['bbox'] for i in instances])
                labels = [i['category'] for i in instances]
                frame_draw = draw_bboxes(frame, bboxes, labels=labels)
                video_writer.add_frame(frame_draw)
            video_writer.close()

    def test_get_instances_with_feats(self):
        instances_lst, fm_lst = self.scene_sensor.get_instances_with_feats(
            self.frames, get_full_fm=True)
        _, h, w = instances_lst[0][0]['fm'].shape
        assert h == w == self.roi_feat_resolution
        assert len(instances_lst) == len(fm_lst) == len(self.frames)
class TestYOLOv3SceneSensor(object):
    def setup_class(self):
        self.scene_sensor = SceneSensor(YOLOv3_MODEL, gpu=0, algorithm='yolov3')
        self.frames = clip_video_to_frames(VIDEO, 3001., 4000.)

    def test_get_instances(self, export=True):
        instances_lst = self.scene_sensor.get_instances(self.frames)
        assert len(instances_lst) == len(self.frames)

        if export:
            h, w, fps = 480, 640, 24.  # read from VIDEO
            video_writer = VideoWriter('data/scene_yolo_demo.mp4', (w, h), fps)
            for frame, instances in zip(self.frames, instances_lst):
                bboxes = np.array([i['bbox'] for i in instances])
                labels = [i['category'] for i in instances]
                frame_draw = draw_bboxes(frame, bboxes, labels=labels)
                video_writer.add_frame(frame_draw)
            video_writer.close()

    def test_get_feature_map(self):
        feature_maps = self.scene_sensor.get_feature_map(self.frames)
        assert len(feature_maps) == len(self.frames)
def _build_detector_program(self):
    self.detector_prog = fluid.Program()
    self.detector_startup_prog = fluid.Program()
    with fluid.program_guard(self.detector_prog, self.detector_startup_prog):
        yolov4_detector = SceneSensor.network(
            self.input_shape,
            'yolov4',
            get_roi_feat=False,
            roi_feat_resolution=self.roi_feat_resolution)
        feed_list, fetch_list = yolov4_detector.build()
        self.detector_feeds = [i.name for i in feed_list]
        self.detector_fetch = fetch_list
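def _run_detector(self, images):
    # A minimal companion sketch (assumed, not part of the original code):
    # execute the program built above with the fluid executor. `images` stands
    # for a preprocessed float32 batch matching self.input_shape; a real
    # pipeline would also load pretrained weights (e.g. via
    # fluid.io.load_params) rather than only running the startup program.
    exe = fluid.Executor(fluid.CUDAPlace(0))
    exe.run(self.detector_startup_prog)  # initialize parameters
    return exe.run(self.detector_prog,
                   feed={self.detector_feeds[0]: images},
                   fetch_list=self.detector_fetch)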
def worker_func(in_queue, out_queue, msg_queue, conf_dict):
    scene_sensor = SceneSensor(
        conf_dict['yolov4_model_dir'],
        gpu=conf_dict['gpu'],
        img_shape=[3, 416, 416],
        roi_feat_resolution=conf_dict['roi_feat_resolution'],
        algorithm='yolov4')

    while True:
        try:
            msg = msg_queue.get_nowait()
        except Empty:
            msg = ''
        if msg == 'stop':
            break

        try:
            anno = in_queue.get(timeout=5)
        except Empty:
            anno = None
        if anno is not None:
            if Enable_Time_Log:
                t1 = time.time()
            if 'Cache' not in anno:
                frames = [pickle.loads(i) for i in anno['Frames']]
                anno['Instances'] = scene_sensor.get_instances_with_feats(
                    frames, get_full_fm=False)
                del anno['Frames']  # to save memory!
            out_queue.put(anno)
            if Enable_Time_Log:
                t2 = time.time()
                print('Detector takes {:.3f}s'.format(t2 - t1))
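# A minimal wiring sketch (assumed, not part of the original code): feed one
# annotation with pickled frames through the detector worker. The model path,
# config values, and demo clip are placeholders.
if __name__ == '__main__':
    from multiprocessing import Process, Queue

    in_queue, out_queue, msg_queue = Queue(), Queue(), Queue()
    conf = {'yolov4_model_dir': 'models/yolov4', 'gpu': 0,
            'roi_feat_resolution': 5}
    p = Process(target=worker_func, args=(in_queue, out_queue, msg_queue, conf))
    p.start()

    # Frames are pickled one by one, matching the worker's pickle.loads().
    frames = clip_video_to_frames('data/demo.mp4', 0., None)
    in_queue.put({'Frames': [pickle.dumps(f) for f in frames]})
    anno = out_queue.get()  # returned with anno['Instances'] filled in

    msg_queue.put('stop')
    p.join()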
def run_worker(tasks, gpu_id, encoder_model, yolov4_model, output_dir,
               max_cosine_distance, resume):
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id
    from perception.tracker.re_id import create_box_encoder, \
        NearestNeighborDistanceMetric
    from perception.tracker.tracker import Tracker, Detection
    from perception.scene.eval import SceneSensor

    encoder = create_box_encoder(encoder_model, batch_size=8)
    metric = NearestNeighborDistanceMetric('cosine', max_cosine_distance, None)
    tracker = Tracker(metric)
    detector = SceneSensor(yolov4_model, gpu=0, img_shape=[3, 416, 416],
                           algorithm='yolov4')

    for video_file in tasks:
        task_id = os.path.basename(video_file)[:-len('.mp4')]
        if resume is not None:
            if resume != task_id:
                continue
            else:
                resume = None

        clip = VideoFileClip(video_file)
        track_video = os.path.join(output_dir, '{}_track.mp4'.format(task_id))
        video_writer = VideoWriter(track_video, (clip.w, clip.h), clip.fps)
        tracker_logs = []

        for frame in clip.iter_frames():
            frame = frame[:, :, ::-1]
            instances = detector.get_instances(frame)[0]
            boxes = [ins['bbox'] for ins in instances]
            features = encoder(frame, boxes)
            detections = [
                Detection(ins, feat)
                for ins, feat in zip(instances, features)
            ]

            tracker.predict()
            tracker.update(detections)

            track_log = dict()
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue
                bbox = track.to_tlbr()
                track_log[str(track.track_id)] = bbox
                # NOTE: https://github.com/opencv/opencv/issues/14866
                # We have to add this line
                frame = np.array(frame)
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 255, 255), 2)
                cv2.putText(frame, str(track.track_id),
                            (int(bbox[0]), int(bbox[1] + 23)), 0, 5e-3 * 100,
                            (0, 255, 0), 2)

            det_log = []
            for det in detections:
                if str(det.cls) != 'person':
                    continue
                bbox = det.to_tlbr()
                score = "%.2f" % round(det.confidence * 100, 2) + "%"
                det_log.append(bbox)
                # NOTE: https://github.com/opencv/opencv/issues/14866
                # We have to add this line
                frame = np.array(frame)
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)
                cv2.putText(frame, score, (int(bbox[0]), int(bbox[3])), 0,
                            5e-3 * 100, (0, 255, 0), 2)

            tracker_logs.append((track_log, det_log))
            video_writer.add_frame(frame)

        video_writer.close()
        convert_to_h264(track_video)
        print('Saved {}'.format(track_video))

        tracker_logs_file = os.path.join(output_dir,
                                         '{}_states.pkl'.format(task_id))
        with open(tracker_logs_file, 'wb') as f:
            pickle.dump(tracker_logs, f)
        print('Saved {}'.format(tracker_logs_file))

        tracker.reset()
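# A minimal dispatch sketch (assumed, not part of the original code): shard a
# video list across GPUs, one run_worker process per GPU. The paths and GPU
# ids below are placeholders.
if __name__ == '__main__':
    import glob
    from multiprocessing import Process

    videos = sorted(glob.glob('data/videos/*.mp4'))  # hypothetical inputs
    gpu_ids = ['0', '1']
    procs = []
    for i, gpu_id in enumerate(gpu_ids):
        shard = videos[i::len(gpu_ids)]  # disjoint shard per GPU
        p = Process(
            target=run_worker,
            args=(shard, gpu_id, 'models/re_id_encoder', 'models/yolov4',
                  'output', 0.2, None))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()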
class SalutationClsDataset(object):
    def __init__(self, video_tracking_dir, anno_dir, yolov4_model_dir,
                 roi_feat_resolution=5, gpu=0):
        self.video_tracking_dir = video_tracking_dir
        self.anno_dir = anno_dir
        self.yolov4_model_dir = yolov4_model_dir
        self.roi_feat_resolution = roi_feat_resolution
        self.gpu = gpu
        self._collect_annotations()
        self._split_train_test_sets(test_percentage=0.2)

    def _collect_annotations(self):
        self.annos = []
        for anno_file in os.listdir(self.anno_dir):
            video_id = '_'.join(anno_file.split('_')[:2])
            print(video_id)
            with open(os.path.join(self.anno_dir, anno_file), 'r') as f:
                for line in f.readlines():
                    anno = json.loads(line)
                    anno['VideoID'] = video_id
                    if anno['Salutation'] != 'null':
                        self.annos.append(anno)

    def _split_train_test_sets(self, test_percentage=0.2):
        # Copied from XiaoduHiDataloaderv2
        videos = set([anno['VideoID'] for anno in self.annos])
        num_test = int(len(videos) * test_percentage)
        ids = np.arange(len(videos))
        np.random.shuffle(ids)
        videos = [list(videos)[i] for i in ids]
        test_videos = set(videos[:num_test])

        self.test_annos, self.train_annos = [], []
        for anno in self.annos:
            if anno['VideoID'] in test_videos:
                self.test_annos.append(anno)
            else:
                self.train_annos.append(anno)

        ids = np.arange(len(self.train_annos))
        np.random.shuffle(ids)
        self.train_annos = [self.train_annos[i] for i in list(ids)]

    def _process_single_anno(self, idx, anno, txt, data_dir):
        if not hasattr(self, "scene_sensor"):
            self.scene_sensor = SceneSensor(
                self.yolov4_model_dir,
                gpu=self.gpu,
                img_shape=[3, 416, 416],
                roi_feat_resolution=self.roi_feat_resolution,
                algorithm='yolov4')

        # Read annos and data
        track_states_file = os.path.join(
            self.video_tracking_dir, '{}_states.pkl'.format(anno['VideoID']))
        with open(track_states_file, 'rb') as f:
            track_states = pickle.load(f)
        video_file = os.path.join(
            self.video_tracking_dir, '{}.mp4'.format(anno['VideoID']))
        frames = clip_video_to_frames(video_file, 0.0, None)

        # Extract frames
        related_frames, related_tracks = [], []
        for frame, (tracks, bboxes) in zip(frames, track_states):
            if anno['ID'] not in tracks:
                continue
            related_frames.append(frame)
            related_tracks.append(tracks[anno['ID']])

        instances_lst = self.scene_sensor.get_instances_with_feats(
            related_frames, get_full_fm=False)
        for frame, instances, track in zip(
                related_frames, instances_lst, related_tracks):
            _, inst_id = max_iou(track, instances, return_id=True)
            if inst_id == -1:
                warnings.warn(
                    'Cannot find corresponding instance for track in '
                    'anno: {}\n'.format(anno))
                continue

            x1, y1, x2, y2 = instances[inst_id]['bbox']
            cv2.imwrite(os.path.join(data_dir, '{}.jpg'.format(idx)),
                        frame[int(y1):int(y2), int(x1):int(x2)])
            np.save(os.path.join(data_dir, '{}.npy'.format(idx)),
                    instances[inst_id]['fm'])

            with open(txt, 'a') as f:
                if anno['Salutation'] == 'man':
                    tree_mask, cls0, cls1, cls2 = '100', 0, -1, -1
                elif anno['Salutation'] == 'woman':
                    tree_mask, cls0, cls1, cls2 = '100', 1, -1, -1
                elif anno['Salutation'] == 'young_boy':
                    tree_mask, cls0, cls1, cls2 = '110', 0, 0, -1
                elif anno['Salutation'] == 'uncle':
                    tree_mask, cls0, cls1, cls2 = '110', 0, 1, -1
                elif anno['Salutation'] == 'young_girl':
                    tree_mask, cls0, cls1, cls2 = '101', 1, -1, 0
                elif anno['Salutation'] == 'aunt':
                    tree_mask, cls0, cls1, cls2 = '101', 1, -1, 1
                f.write('{} {} {} {} {}\n'.format(
                    idx, tree_mask, cls0, cls1, cls2))
            idx += 1
        return idx

    def build_dataset(self, output_dir):
        train_dir = os.path.join(output_dir, 'train')
        test_dir = os.path.join(output_dir, 'test')
        train_txt = os.path.join(output_dir, 'train.txt')
        test_txt = os.path.join(output_dir, 'test.txt')
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)

        for txt, data_dir, annos in zip(
                [test_txt, train_txt], [test_dir, train_dir],
                [self.test_annos, self.train_annos]):
            print('Generating {}'.format(txt))
            idx = 0
            for anno in annos:
                idx = self._process_single_anno(idx, anno, txt, data_dir)
            print(idx)
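# A minimal usage sketch (assumed, not part of the original code): build the
# salutation classification dataset from tracked videos plus annotations.
# All directory paths are placeholders.
if __name__ == '__main__':
    dataset = SalutationClsDataset(
        video_tracking_dir='output/tracking',
        anno_dir='data/salutation_annos',
        yolov4_model_dir='models/yolov4',
        roi_feat_resolution=5,
        gpu=0)
    # Writes person crops (.jpg), ROI feature maps (.npy), and the
    # train.txt / test.txt label files under the output directory.
    dataset.build_dataset('data/salutation_cls')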
def worker_func(yolov4_model_dir, video_dir, anno_lst, data_queue, msg_queue,
                conf_dict):
    video_aug = VideoAugmentorV2()
    scene_sensor = SceneSensor(
        yolov4_model_dir,
        gpu=conf_dict['gpu'],
        img_shape=[3, 416, 416],
        roi_feat_resolution=conf_dict['roi_feat_resolution'],
        algorithm='yolov4')

    def _process_neg_frames(anno):
        # Ignore examples in which someone is interacting with the robot
        if check_passive_interaction(anno['Path']):
            return

        try:
            frames = read_all_frames(anno['Path'])
        except Exception:
            warnings.warn('OpenCV IO error. Reading {}'.format(anno['Path']))
            return

        frames = sample_frames(frames, conf_dict['ob_window_len'])
        h, w, _ = frames[0].shape
        if h / w == 480 / 640:
            frames = [cv2.resize(i, (640, 480)) for i in frames]
        elif h / w == 720 / 1280:
            frames = [cv2.resize(i, (1280, 720)) for i in frames]

        instances_lst = scene_sensor.get_instances_with_feats(
            frames, get_full_fm=False)
        success, data = convert_instances_lst_to_data(
            instances_lst, conf_dict['tokens_per_frame'], {}, [],
            anno['WAE_id'], conf_dict['inst_crop_shape'],
            conf_dict['inst_fm_shape'], conf_dict['inst_pos_dim'],
            conf_dict['inst_cls_dim'], conf_dict['visual_token_dim'])
        if success:
            data_queue.put(data)
        else:
            warnings.warn('Failed to process annotation: {}\n'.format(anno))

    def _process_single_anno(anno):
        if anno['VideoType'] == 'neg_frames':
            _process_neg_frames(anno)
            return

        te = timestamp_to_ms(anno['Time'])
        ts = te - conf_dict['ob_window_len'] * conf_dict['interval']
        frames_dir = os.path.join(video_dir, anno['VideoID'])
        if conf_dict['use_frames_first'] and os.path.isdir(frames_dir):
            # Read images
            try:
                frames = read_frames_dir(frames_dir, max(0.0, ts), te)
                frames = sample_frames(frames, conf_dict['ob_window_len'])
                ctx_frames = read_frames_dir(frames_dir, 0.0, te)
            except Exception:
                warnings.warn('OpenCV IO error. Reading {}'.format(frames_dir))
                return

            h, w, _ = frames[0].shape
            if h / w == 480 / 640:
                frames = [cv2.resize(i, (640, 480)) for i in frames]
            elif h / w == 720 / 1280:
                frames = [cv2.resize(i, (1280, 720)) for i in frames]
        else:
            # Read video
            video_file = os.path.join(
                video_dir, '{}.mp4'.format(anno['VideoID']))
            frames = clip_video_to_frames(video_file, max(0.0, ts), te)
            frames = sample_frames(frames, conf_dict['ob_window_len'])
            ctx_frames = clip_video_to_frames(video_file, 0.0, te)

        track_states_file = os.path.join(
            video_dir, '{}_states.pkl'.format(anno['VideoID']))
        with open(track_states_file, 'rb') as f:
            track_states = pickle.load(f)
        last_frame_tracks = track_states[len(ctx_frames) - 1][0]

        obj_ids = anno['ID'].split(',') if anno['ID'] != '' else []
        check_passed = True
        for idx in obj_ids:
            check_passed = check_passed and idx in last_frame_tracks
        if not check_passed:
            warnings.warn('Failed to process annotation: {}\n'.format(anno))
            return

        if conf_dict['augment']:
            while True:
                aug_frames = video_aug(frames)
                instances = scene_sensor.get_instances(aug_frames[-1:])[0]
                iou_lst = [max_iou(last_frame_tracks[idx], instances)
                           for idx in obj_ids]
                if len(iou_lst) == 0 or min(iou_lst) > 0.5:
                    break
            frames = aug_frames

        instances_lst = scene_sensor.get_instances_with_feats(
            frames, get_full_fm=False)
        success, data = convert_instances_lst_to_data(
            instances_lst, conf_dict['tokens_per_frame'], last_frame_tracks,
            obj_ids, anno['WAE_id'], conf_dict['inst_crop_shape'],
            conf_dict['inst_fm_shape'], conf_dict['inst_pos_dim'],
            conf_dict['inst_cls_dim'], conf_dict['visual_token_dim'])
        if success:
            data_queue.put(data)
        else:
            warnings.warn('Failed to process annotation: {}\n'.format(anno))

    while True:
        msg = msg_queue.get()
        if msg == 'stop':
            break
        elif msg == 'new_epoch':
            for anno in anno_lst:
                if conf_dict['read_cache'] and 'VideoID' in anno:
                    cache_file = '{}_{}_cache.pkl'.format(
                        anno['VideoID'], stable_anno_hash(anno))
                    cache_file = os.path.join(video_dir, cache_file)
                    if os.path.exists(cache_file):
                        with open(cache_file, 'rb') as f:
                            data = pickle.load(f)
                        data_queue.put(data)
                    else:
                        _process_single_anno(anno)
                else:
                    _process_single_anno(anno)
        elif len(msg) == 2 and msg[0] == 'update':
            anno_lst = msg[1]
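# A minimal driver sketch (assumed, not part of the original code): one worker
# process, where 'new_epoch' triggers a pass over the annotation list and
# 'stop' shuts the worker down. The conf values and paths are placeholders;
# the keys mirror the ones the worker reads above.
if __name__ == '__main__':
    from multiprocessing import Process, Queue

    annos = []  # annotation dicts, loaded elsewhere
    data_queue, msg_queue = Queue(maxsize=64), Queue()
    conf = dict(gpu=0, roi_feat_resolution=5, ob_window_len=8, interval=500,
                tokens_per_frame=10, inst_crop_shape=[3, 96, 96],
                inst_fm_shape=[256, 5, 5], inst_pos_dim=16, inst_cls_dim=16,
                visual_token_dim=512, use_frames_first=True, augment=False,
                read_cache=False)
    p = Process(target=worker_func,
                args=('models/yolov4', 'data/videos', annos,
                      data_queue, msg_queue, conf))
    p.start()

    msg_queue.put('new_epoch')
    # Consume converted samples as they arrive, e.g. data = data_queue.get()

    msg_queue.put('stop')
    p.join()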