def new_pred(i1: Interval, i2: Interval) -> bool:
    # Predicate: decide whether two intervals carry trajectories of the same
    # moving object. Requires co-occurrence in time, spatial overlap, and a
    # positive correlation of the two centroid tracks in both x and y.
    # Relies on closure variables: trajectory_key, fps, corrcoef, logger.
    assert trajectory_key in i1.payload
    assert trajectory_key in i2.payload
    # id(i1) < id(i2) evaluates each unordered pair only once (de-dup of the
    # symmetric comparison); same_time / iou_at_least are cheap pre-filters
    # run before the expensive correlation computation below.
    if not id(i1) < id(i2) \
        or not same_time(fps*10)(i1, i2) \
        or not iou_at_least(0.5)(i1, i2):
        return False
    logger.debug("checking trajecory corr")

    def get_txy(i):
        # returns .shape=(n, 3). Each row is (t, x, y)
        return np.array([[ j['t1'], ] + list(centroid(j))
                         for j in i.payload[trajectory_key]])

    txy_1, txy_2 = get_txy(i1), get_txy(i2)
    ts = txy_1[:, 0]  # use 1's time as reference
    # Resample trajectory 2 onto trajectory 1's timestamps with linear
    # interpolation so the two centroid series align sample-by-sample.
    txy_2 = np.stack((ts,
                      np.interp(ts, txy_2[:, 0], txy_2[:, 1]),
                      np.interp(ts, txy_2[:, 0], txy_2[:, 2])),
                     axis=1)
    # logger.debug(f"txy_1={txy_1}\ntxy_2={txy_2}")
    # Pearson correlation of the x series and of the y series.
    corr_x = np.corrcoef(txy_1[:, 1], txy_2[:, 1])[0, 1]
    corr_y = np.corrcoef(txy_1[:, 2], txy_2[:, 2])[0, 1]
    logger.debug(f"corr_x={corr_x}, corr_y={corr_y}")
    # `corrcoef` (closure variable) is the threshold both axes must meet.
    return corr_x >= corrcoef and corr_y >= corrcoef
def test_coalesce_iou_distance(self):
    """Coalesce with a `distance` function keeps both overlapping persons."""
    stream = FromIterable(self.intrvl_list_iou_distance)()
    coalesce_op = Coalesce(
        axis=('t1', 't2'),
        bounds_merge_op=Bounds3D.span,
        predicate=and_pred(before(), iou_at_least(0.001)),  # both persons at frame 5 should pass
        epsilon=6,
        distance=_iou,
    )
    results = run_to_finish(coalesce_op(stream))
    expected = [
        Interval(Bounds3D(0, 11, 0., .52, 0., .52),
                 {'msg': 'first person starts at frame 0'}),
        Interval(Bounds3D(5, 6, .1, .6, .1, .6),
                 {'msg': 'second person starts at frame 5 and overlaps with first too'}),
    ]
    self.assertIntervalListEq(results, expected, bounds_only=True)
def query(path, session):
    """Find "handshake" events (two persons close together) in an Okutama video.

    Returns a dict with:
      * 'metadata': fps, frame_count, width, height of the input video;
      * 'results': list of (Bounds3D, mp4-bytes) tuples (mp4 bytes empty
        when GET_MP4 is false).
    """
    cv2.setNumThreads(8)
    query_result = {}
    decoder = LocalVideoDecoder(path)
    frame_count, fps = decoder.frame_count, int(np.round(decoder.fps))
    query_result['metadata'] = {
        'fps': fps,
        'frame_count': frame_count,
        'width': decoder.raw_width,
        'height': decoder.raw_height,
    }
    query_result['results'] = list()
    # The decoder was only needed for metadata; release it before running.
    del decoder
    detect_step = 30
    # Two independent streams over the same cached detections, so the Join
    # below can pair every person box with every other person box.
    person_1 = DetectionFilterFlatten(['person'], 0.3)(CachedOkutamaDetection(path, OKUTAMA_CACHE_DIR)())
    person_2 = DetectionFilterFlatten(['person'], 0.3)(CachedOkutamaDetection(path, OKUTAMA_CACHE_DIR)())

    def is_handshake(i1: Interval, i2: Interval) -> bool:
        # Same frame, i1 strictly left of i2, and centroid offset within
        # 1.5x of i1's box dimensions.
        # NOTE(review): dx/dy are not abs()-ed, so negative offsets always
        # satisfy the <= checks — confirm this asymmetry is intended.
        dx, dy = centroid(i1) - centroid(i2)
        return i1['t1'] == i2['t1'] and i1['x1'] < i2['x1'] and dx <= 1.5*i1.bounds.width() and dy <= 1.5*i1.bounds.height()

    def merge_op(i1, i2):
        # Span the two person boxes into a single event interval.
        new_bounds = i2.bounds.span(i1)
        new_bounds['t2'] = new_bounds['t2']  # NOTE(review): no-op; looks like a leftover edit
        return Interval(new_bounds)

    handshake_event = Join(
        predicate=is_handshake,
        merge_op=merge_op,
        window=detect_step
    )(person_1, person_2)
    # dedup and merge final results
    handshake_event = Coalesce(predicate=iou_at_least(.001),epsilon=detect_step)(handshake_event)

    if GET_MP4:
        # Crop the original video around each event for visualization.
        vis_decoder = LRULocalVideoDecoder(path, cache_size=300)
        raw_fg = VideoCropFrameGroup(vis_decoder, copy_payload=True)(handshake_event)
        output = raw_fg
    else:
        output = handshake_event

    # Drive the lazy graph: subscribe, start worker threads, drain results.
    output_sub = output.subscribe()
    output.start_thread_recursive()
    for _, intrvl in enumerate(output_sub):
        if GET_MP4:
            query_result['results'].append((intrvl.bounds.copy(), intrvl.get_mp4()))
        else:
            query_result['results'].append((intrvl.bounds.copy(), b''))
        # Drop the (potentially large) frame payload as soon as possible.
        del intrvl
    gc.collect()
    return query_result
def test_coalesce_iou_early_late_epsilon(self):
    """Coalesce with IoU>=0.8 and epsilon=6 keeps the three persons distinct."""
    stream = FromIterable(self.intrvl_list_iou_early_late_epsilon)()
    coalesced = Coalesce(
        axis=('t1', 't2'),
        bounds_merge_op=Bounds3D.span,
        predicate=iou_at_least(0.8),
        epsilon=6,
    )(stream)
    expected = [
        Interval(Bounds3D(0, 11, 0., .52, 0., .52), {'msg': 'first person .'}),
        Interval(Bounds3D(3, 8, 0.49, 1., .49, 1.), {'msg': 'second person .'}),
        Interval(Bounds3D(100, 120, 0., .8, .0, .8), {'msg': 'third person starts .'}),
    ]
    results = run_to_finish(coalesced)
    self.assertIntervalListEq(results, expected, bounds_only=True)
def trajectory_merge_predicate(i1, i2):
    """True iff i1's trajectory can be concatenated with i2's.

    Requires i1 to end within 2*detect_step frames before i2 starts, and the
    last box of i1's trajectory to overlap the first box of i2's trajectory
    with IoU >= 0.1.
    """
    if not meets_before(detect_step*2)(i1, i2):
        return False
    tail_box = i1.payload['trajectory'][-1]
    head_box = i2.payload['trajectory'][0]
    return iou_at_least(0.1)(tail_box, head_box)
def query(path, session):
    """Detect persons loading/unloading stopped vehicles in a VIRAT video.

    Two-stage query:
      1. Find vehicles that stay still for >= 3 seconds (coalesced
         detections).
      2. Re-run person detection inside a dilated spatio-temporal volume
         around each stopped vehicle, keep stationary persons, and join
         them with the vehicles.

    Returns a dict with 'metadata', 'stopped_cars' (count), and 'results'
    (list of (Bounds3D, mp4-bytes-or-empty)).
    """
    cv2.setNumThreads(8)
    query_result = {}
    decoder = LocalVideoDecoder(path)
    frame_count, fps = decoder.frame_count, int(np.round(decoder.fps))
    query_result['metadata'] = {
        'fps': fps,
        'frame_count': frame_count,
        'width': decoder.raw_width,
        'height': decoder.raw_height,
    }
    query_result['results'] = list()
    del decoder  # only needed for metadata

    detect_step = int(fps)  # run detection roughly once per second
    all_frames = VideoToFrames(LocalVideoDecoder(path))()
    sampled_frames = Slice(step=detect_step)(all_frames)
    # detections = Detection(server_list=DETECTION_SERVERS, parallel=2)(sampled_frames)
    detections = CachedVIRATDetection(path, cache_dir=VIRAT_CACHE_DIR)(sampled_frames)
    crop_cars = DetectionFilterFlatten(['car', 'truck', 'bus'], 0.3)(detections)
    # A "stopped" car: successive detections with IoU >= 0.7 coalesce into
    # one interval; epsilon tolerates up to 3 missed detection steps.
    stopped_cars = Coalesce(
        predicate=iou_at_least(0.7),
        bounds_merge_op=Bounds3D.span,
        epsilon=detect_step*3
    )(crop_cars)
    # further de-dup as detection boxes can be flashy
    stopped_cars = Coalesce(
        predicate=iou_at_least(0.5),
        bounds_merge_op=Bounds3D.span,
        payload_merge_op=lambda p1, p2: {},  # drop the rgb
        epsilon=detect_step*3
    )(stopped_cars)
    # Keep only vehicles stopped for at least 3 seconds.
    stopped_cars = Filter(pred_fn=lambda i: i.bounds.length() >= 3*fps)(stopped_cars)
    # buffer all stopped cars
    stopped_cars_sub = stopped_cars.subscribe()
    stopped_cars.start_thread_recursive()
    buffered_stopped_cars = list(stopped_cars_sub)
    logger.info(f"Find {len(buffered_stopped_cars)} stopped cars.")
    query_result['stopped_cars'] = len(buffered_stopped_cars)

    # Stage 2: reprocess buffered stopped cars
    gc.collect()
    stopped_cars_1 = FromIterable(buffered_stopped_cars)()

    def dilate_car(icar: Interval) -> Interval:
        # Grow the car's volume by 1 s in time and by one car width/height in
        # space so nearby persons fall inside the redetection volume.
        # Spatial coords appear normalized to [0, 1] (clamped with min(1, .)).
        carh, carw = _height(icar), _width(icar)
        new_bounds = Bounds3D(
            t1=int(max(0, icar['t1']-fps)),
            t2=int(min(frame_count, icar['t2']+fps)),
            x1=max(0, icar['x1'] - carw),
            x2=min(1, icar['x2'] + carw),
            y1=max(0, icar['y1'] - carh),
            y2=min(1, icar['y2'] + carh)
        )
        return Interval(new_bounds)

    dialted_stopped_cars = Map(dilate_car)(stopped_cars_1)  # sic: "dialted"
    # sample single-frame bounds from redetect_volumnes for object detection
    redetect_bounds = Flatten(
        flatten_fn=lambda i: \
            [ Interval(Bounds3D(t1, t1+1, i['x1'], i['x2'], i['y1'], i['y2'])) \
                for t1 in range(int(i['t1']), int(i['t2']), detect_step) ]
    )(dialted_stopped_cars)
    redetect_bounds = Sort(window=frame_count)(redetect_bounds)
    redetect_fg = VideoCropFrameGroup(LRULocalVideoDecoder(path, cache_size=300), name="crop_redetect_volume")(redetect_bounds)
    redetect_patches = Flatten(
        flatten_fn=lambda fg: fg.to_image_intervals()
    )(redetect_fg)
    # Don't sample again here
    redetect_detection = Detection(server_list=DETECTION_SERVERS, parallel=len(DETECTION_SERVERS))(redetect_patches)
    redetect_person = DetectionFilterFlatten(['person'], 0.3)(redetect_detection)
    # A "stationary" person: overlapping re-detections lasting >= 2 seconds.
    stationary_person = Coalesce(predicate=iou_at_least(0.1), epsilon=2*detect_step)(redetect_person)
    stationary_person = Filter(pred_fn=lambda i: i.bounds.length()>=2*fps)(stationary_person)

    def loading_merge_op(i_person, i_vehicle):
        # Span the boxes spatially, but clamp time to the person's interval.
        new_bounds = i_vehicle.bounds.span(i_person)
        new_bounds['t1'] = i_person['t1']
        new_bounds['t2'] = i_person['t2']
        return Interval(new_bounds)

    loading_event = Join(
        predicate=and_pred(iou_at_least(0.1), during()),
        merge_op=loading_merge_op
    )(stationary_person, FromIterable(buffered_stopped_cars)())
    # dedup and merge final results
    loading_event = Coalesce(predicate=overlaps())(loading_event)

    if GET_MP4:
        vis_decoder = LRULocalVideoDecoder(path, cache_size=300)
        raw_fg = VideoCropFrameGroup(vis_decoder, copy_payload=True)(loading_event)
        output = raw_fg
    else:
        output = loading_event

    # Drive the lazy graph and drain results.
    output_sub = output.subscribe()
    output.start_thread_recursive()
    for _, intrvl in enumerate(output_sub):
        if GET_MP4:
            query_result['results'].append((intrvl.bounds.copy(), intrvl.get_mp4()))
        else:
            query_result['results'].append((intrvl.bounds.copy(), b''))
        del intrvl  # free frame payload eagerly
    gc.collect()
    return query_result
def new_pred(i1: Interval, i2: Interval) -> bool:
    """Return True when i1's trajectory can be extended by i2's.

    i1 must end within `epsilon` before i2 starts, and the final box of
    i1's trajectory must overlap the first box of i2's trajectory with
    IoU >= `iou_thres` (epsilon, iou_thres, key come from the enclosing
    scope).
    """
    if not meets_before(epsilon)(i1, i2):
        return False
    tail = i1.payload[key][-1]
    head = i2.payload[key][0]
    return iou_at_least(iou_thres)(tail, head)
)(track_car_trajectories) long_person_trajectories = Filter( pred_fn=lambda intrvl: intrvl.bounds.length() >= fps * 5 )(merged_person_trajectories) long_car_trajectories = Filter( pred_fn=lambda intrvl: intrvl.bounds.length() >= fps * 5 )(merged_car_trajectories) # de-dup car trajs long_car_trajectories = Coalesce( bounds_merge_op=Bounds3D.span, payload_merge_op=lambda p1, p2: p1, predicate=iou_at_least(0.5), epsilon=fps*60*5, )(long_car_trajectories) crosswalk_patches = JoinWithTimeWindow( predicate=is_crosswalk(), merge_op=crosswalk_merge('traj_person', 'traj_car'), window=frame_count, name="join_crosswalk" )(long_person_trajectories, long_car_trajectories) if SAVE_VIDEO: vis_decoder = LRULocalVideoDecoder(INPUT_NAME, cache_size=900, resize=600) raw_fg = VideoCropFrameGroup(vis_decoder, copy_payload=True, parallel=4)(crosswalk_patches)
def query(path, session):
    """Detect persons getting out of stopped vehicles in a VIRAT video.

    Two-stage query:
      1. Find vehicles stopped for >= 3 seconds.
      2. Re-detect persons inside a dilated volume around each stopped
         vehicle, track them into trajectories, and join long trajectories
         whose starting box overlaps a stopped vehicle mid-stop.

    Returns a dict with 'metadata', 'stopped_cars' (count), and 'results'
    (list of (Bounds3D, mp4-bytes-or-empty)).
    """
    cv2.setNumThreads(8)
    query_result = {}
    decoder = LocalVideoDecoder(path)
    frame_count, fps = decoder.frame_count, int(np.round(decoder.fps))
    query_result['metadata'] = {
        'fps': fps,
        'frame_count': frame_count,
        'width': decoder.raw_width,
        'height': decoder.raw_height,
    }
    query_result['results'] = list()
    del decoder  # only needed for metadata

    detect_step = int(fps)  # run detection roughly once per second
    all_frames = VideoToFrames(LocalVideoDecoder(path))()
    sampled_frames = Slice(step=detect_step)(all_frames)
    # detections = Detection(server_list=DETECTION_SERVERS, parallel=2)(sampled_frames)
    detections = CachedVIRATDetection(
        path, cache_dir=VIRAT_CACHE_DIR)(sampled_frames)
    crop_cars = DetectionFilterFlatten(['car', 'truck', 'bus'], 0.3)(detections)
    # A "stopped" car: successive detections with IoU >= 0.7 coalesce into
    # one interval; epsilon tolerates up to 3 missed detection steps.
    stopped_cars = Coalesce(predicate=iou_at_least(0.7),
                            bounds_merge_op=Bounds3D.span,
                            epsilon=detect_step * 3)(crop_cars)
    # further de-dup as detection boxes can be flashy
    stopped_cars = Coalesce(
        predicate=iou_at_least(0.5),
        bounds_merge_op=Bounds3D.span,
        payload_merge_op=lambda p1, p2: {},  # drop the rgb
        epsilon=detect_step * 3)(stopped_cars)
    # Keep only vehicles stopped for at least 3 seconds.
    stopped_cars = Filter(
        pred_fn=lambda i: i.bounds.length() >= 3 * fps)(stopped_cars)
    # buffer all stopped cars
    stopped_cars_sub = stopped_cars.subscribe()
    stopped_cars.start_thread_recursive()
    buffered_stopped_cars = list(stopped_cars_sub)
    logger.info(f"Find {len(buffered_stopped_cars)} stopped cars.")
    query_result['stopped_cars'] = len(buffered_stopped_cars)

    # Stage 2: reprocess buffered stopped cars
    gc.collect()
    stopped_cars_1 = FromIterable(buffered_stopped_cars)()

    def dilate_car(icar: Interval) -> Interval:
        # Grow the car volume by 1 s in time and one car width/height in
        # space so emerging persons fall inside the redetection volume.
        carh, carw = _height(icar), _width(icar)
        new_bounds = Bounds3D(t1=int(max(0, icar['t1'] - fps)),
                              t2=int(min(frame_count, icar['t2'] + fps)),
                              x1=max(0, icar['x1'] - carw),
                              x2=min(1, icar['x2'] + carw),
                              y1=max(0, icar['y1'] - carh),
                              y2=min(1, icar['y2'] + carh))
        return Interval(new_bounds)

    dialted_stopped_cars = Map(dilate_car)(stopped_cars_1)  # sic: "dialted"
    # sample single-frame bounds from redetect_volumnes for object detection
    redetect_bounds = Flatten(
        flatten_fn=lambda i: \
            [ Interval(Bounds3D(t1, t1+1, i['x1'], i['x2'], i['y1'], i['y2'])) \
                for t1 in range(int(i['t1']), int(i['t2']), detect_step) ]
    )(dialted_stopped_cars)
    redetect_bounds = Sort(window=frame_count)(redetect_bounds)
    redetect_fg = VideoCropFrameGroup(
        LRULocalVideoDecoder(path, cache_size=300),
        name="crop_redetect_volume")(redetect_bounds)
    redetect_patches = Flatten(
        flatten_fn=lambda fg: fg.to_image_intervals())(redetect_fg)
    # Don't sample again here
    redetect_detection = Detection(
        server_list=DETECTION_SERVERS,
        parallel=len(DETECTION_SERVERS))(redetect_patches)
    redetect_person = DetectionFilterFlatten(['person'], 0.3)(redetect_detection)

    # Track each re-detected person box forward/backward into a short
    # trajectory, then concatenate compatible tracklets into long ones.
    rekey = 'traj_person'
    short_person_trajectories = TrackFromBox(
        LRULocalVideoDecoder(path, cache_size=300),
        window=detect_step,
        step=2,
        trajectory_key=rekey,
        bidirectional=True,
        parallel_workers=2,
        name='track_person')(redetect_person)
    short_person_trajectories = Sort(2 * detect_step)(short_person_trajectories)
    long_person_trajectories = Coalesce(
        predicate=traj_concatable(3 * detect_step, 0.1, rekey),
        bounds_merge_op=Bounds3D.span,
        payload_merge_op=traj_concat_payload(rekey),
        epsilon=10 * detect_step)(short_person_trajectories)
    long_person_trajectories = Filter(lambda i: i.bounds.length() > 3 * fps)(
        long_person_trajectories)

    def merge_op_getout(ic, ip):
        # Event volume: union of car and person boxes, with the time window
        # anchored around the moment the person trajectory starts.
        new_bounds = ic.bounds.span(ip)
        new_bounds['t1'] = max(0, ip['t1'] - 3 * fps)  # wind back 3 seconds
        new_bounds['t2'] = min(frame_count, ip['t1'] + fps)
        new_payload = {rekey: ip.payload[rekey]}
        return Interval(new_bounds, new_payload)

    # Person trajectory must start while the car is stopped, and its first
    # box must overlap the car (IoU > 0.05).
    get_out = JoinWithTimeWindow(
        lambda ic, ip: ic['t1'] < ip['t1'] < ic['t2'] and _iou(
            ic, ip.payload[rekey][0]) > 0.05,
        merge_op=merge_op_getout,
        window=5 * 60 * fps)(FromIterable(buffered_stopped_cars)(),
                             long_person_trajectories)
    # dedup and merge final results
    get_out = Coalesce(predicate=overlaps())(get_out)

    if GET_MP4:
        vis_decoder = LRULocalVideoDecoder(path, cache_size=900)
        raw_fg = VideoCropFrameGroup(vis_decoder, copy_payload=True)(get_out)
        # Overlay the person trajectory on the cropped clip.
        visualize_fg = VisualizeTrajectoryOnFrameGroup(
            rekey, name="visualize-person-traj")(raw_fg)
        output = visualize_fg
    else:
        output = get_out

    # Drive the lazy graph and drain results.
    output_sub = output.subscribe()
    output.start_thread_recursive()
    for _, intrvl in enumerate(output_sub):
        if GET_MP4:
            query_result['results'].append(
                (intrvl.bounds.copy(), intrvl.get_mp4()))
        else:
            query_result['results'].append((intrvl.bounds.copy(), b''))
        del intrvl  # free frame payload eagerly
    gc.collect()
    return query_result
if __name__ == "__main__":
    # Script entry point: detect persons in INPUT_NAME, merge detections of
    # the same person over time, and save each long appearance as an mp4
    # clip under OUTPUT_DIR.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    fps = 15          # assumed frame rate of INPUT_NAME — TODO confirm
    sample_every = 3  # run detection on every 3rd frame
    all_frames = VideoToFrames(LocalVideoDecoder(INPUT_NAME))()
    sampled_frames = Slice(step=sample_every)(all_frames)
    detections = Detection('cloudlet031.elijah.cs.cmu.edu', 5000)(sampled_frames)
    crop_persons = DetectionFilterFlatten(['person'], 0.3)(detections)
    # Try `Coalesce()`. Will have different results
    coalesced_persons = CoalesceByLast(
        bounds_merge_op=Bounds3D.span,
        predicate=iou_at_least(0.1),
        epsilon=sample_every*3)(crop_persons)
    long_coalesced_persons = Filter(
        pred_fn=lambda intrvl: intrvl.bounds.length() > fps * 3  # 3 seconds
    )(coalesced_persons)
    fg = LocalVideoCropFrameGroup(INPUT_NAME)(long_coalesced_persons)
    output = fg
    # Drive the lazy graph: subscribe, start worker threads, drain results.
    output_sub = output.subscribe()
    output.start_thread_recursive()
    for k, intrvl in enumerate(output_sub):
        assert isinstance(intrvl, FrameGroupInterval)
        out_name = f"{OUTPUT_DIR}/{k}-{intrvl['t1']}-{intrvl['t2']}-{intrvl['x1']:.2f}-{intrvl['y1']:.2f}.mp4"
        intrvl.savevideo(out_name, fps=fps)
# Script body: find frames where a person and a car appear close together
# and save each co-occurrence as a cropped jpg under OUTPUT_DIR.
os.makedirs(OUTPUT_DIR, exist_ok=True)
fps = 15          # assumed frame rate of INPUT_NAME — TODO confirm
sample_every = 3  # run detection on every 3rd frame
all_frames = VideoToFrames(LocalVideoDecoder(INPUT_NAME))()
sampled_frames = Slice(step=sample_every)(all_frames)
detections = Detection('cloudlet031.elijah.cs.cmu.edu', 5000)(sampled_frames)
# this is a fork in the compute graph
crop_persons = DetectionFilterFlatten(['person'], 0.5)(detections)
crop_cars = DetectionFilterFlatten(['car'], 0.5)(detections)
# Pair person and car boxes from the same frame that overlap at all
# (IoU >= 0.01); the merged interval spans both boxes.
crop_person_and_car = JoinWithTimeWindow(
    predicate=lambda i1, i2: i1['t1'] == i2['t1'] and iou_at_least(0.01)
    (i1, i2),
    merge_op=lambda i1, i2: ImageInterval(i1.bounds.span(i2), root=i1.root
                                          ),
    window=1)(crop_persons, crop_cars)
output = crop_person_and_car
# Drive the lazy graph: subscribe, start worker threads, drain results.
output_sub = output.subscribe()
output.start_thread_recursive()
logger.info("This may take a while")
for k, intrvl in enumerate(output_sub):
    assert isinstance(intrvl, ImageInterval)
    out_name = f"{OUTPUT_DIR}/{k}-{intrvl['t1']}-{intrvl['t2']}-{intrvl['x1']:.2f}-{intrvl['y1']:.2f}.jpg"
    intrvl.savefile(out_name)
    logger.debug(f"saved {out_name}")
parallel=8)(detections) else: # use expensive trackers crop_persons = DetectionFilterFlatten(['person'], 0.5)(detections) track_person_trajectories = TrackFromBox( LRULocalVideoDecoder(INPUT_NAME, cache_size=900), detect_step, step=2, trajectory_key='traj_person', parallel_workers=12, name='track_person')(crop_persons) crop_cars = DetectionFilterFlatten(['car'], 0.5)(detections) stopped_cars = Coalesce(predicate=iou_at_least(0.5), bounds_merge_op=Bounds3D.span, epsilon=fps * 3)(crop_cars) stopped_cars = Filter( pred_fn=lambda i: i.bounds.length() >= 3 * fps)(stopped_cars) merged_person_trajectories = Coalesce( predicate=traj_concatable(3, 0.1, 'traj_person'), bounds_merge_op=Bounds3D.span, payload_merge_op=traj_concat_payload('traj_person'), epsilon=10)(track_person_trajectories) long_person_trajectories = Filter( pred_fn=lambda intrvl: intrvl.bounds.length() >= 3 * fps)( merged_person_trajectories)
def query(path, session):
    """Detect persons getting out of stopped cars in a VIRAT video (visualizing variant).

    Stage 1 finds cars stopped for >= 3 s; stage 2 re-detects and tracks
    persons inside dilated volumes around those cars, then joins person
    trajectories that begin while a car is stopped and overlap it.
    Each final event is saved as an mp4 under OUTPUT_DIR with the person
    trajectory drawn on the frames.

    NOTE(review): query_result['results'] is initialized but never filled,
    and no `return` is visible in this view — confirm whether the function
    is expected to return query_result.
    """
    cv2.setNumThreads(8)
    query_result = {}
    decoder = LocalVideoDecoder(path)
    frame_count, fps = decoder.frame_count, int(np.round(decoder.fps))
    query_result['metadata'] = {
        'fps': fps,
        'frame_count': frame_count,
        'width': decoder.raw_width,
        'height': decoder.raw_height,
    }
    query_result['results'] = list()
    del decoder  # only needed for metadata

    detect_step = int(fps)  # run detection roughly once per second
    all_frames = VideoToFrames(LocalVideoDecoder(path))()
    sampled_frames = Slice(step=detect_step)(all_frames)
    # detections = Detection(server_list=DETECTION_SERVERS, parallel=16)(sampled_frames)
    detections = CachedVIRATDetection(
        path, cache_dir="../virat_experiment/cache")(sampled_frames)
    crop_cars = DetectionFilterFlatten(['car'], 0.5)(detections)
    # A "stopped" car: successive detections with IoU >= 0.7 coalesce into
    # one interval; epsilon tolerates up to 3 missed detection steps.
    stopped_cars = Coalesce(predicate=iou_at_least(0.7),
                            bounds_merge_op=Bounds3D.span,
                            epsilon=detect_step * 3)(crop_cars)
    # further de-dup as detection boxes can be flashy
    stopped_cars = Coalesce(
        predicate=iou_at_least(0.5),
        bounds_merge_op=Bounds3D.span,
        payload_merge_op=lambda p1, p2: {},  # drop the rgb
        epsilon=detect_step * 3)(stopped_cars)
    # Keep only cars stopped for at least 3 seconds.
    stopped_cars = Filter(
        pred_fn=lambda i: i.bounds.length() >= 3 * fps)(stopped_cars)
    # buffer all stopped cars
    stopped_cars_sub = stopped_cars.subscribe()
    stopped_cars.start_thread_recursive()
    buffered_stopped_cars = list(stopped_cars_sub)
    logger.info(f"Find {len(buffered_stopped_cars)} stopped cars. {list(c.bounds for c in buffered_stopped_cars)}")
    query_result['stopped_cars'] = len(buffered_stopped_cars)

    # Stage 2: reprocess buffered stopped cars
    gc.collect()
    stopped_cars_1 = FromIterable(buffered_stopped_cars)()

    def dilate_car(icar: Interval) -> Interval:
        # Grow the car volume by 1 s in time and one car width/height in
        # space so emerging persons fall inside the redetection volume.
        carh, carw = _height(icar), _width(icar)
        new_bounds = Bounds3D(t1=int(max(0, icar['t1'] - fps)),
                              t2=int(min(frame_count, icar['t2'] + fps)),
                              x1=max(0, icar['x1'] - carw),
                              x2=min(1, icar['x2'] + carw),
                              y1=max(0, icar['y1'] - carh),
                              y2=min(1, icar['y2'] + carh))
        return Interval(new_bounds)

    dialted_stopped_cars = Map(dilate_car)(stopped_cars_1)  # sic: "dialted"
    # sample single-frame bounds from redetect_volumnes for object detection
    redetect_bounds = Flatten(
        flatten_fn=lambda i: \
            [ Interval(Bounds3D(t1, t1+1, i['x1'], i['x2'], i['y1'], i['y2'])) \
                for t1 in range(int(i['t1']), int(i['t2']), detect_step) ]
    )(dialted_stopped_cars)
    redetect_bounds = Sort(window=frame_count)(redetect_bounds)
    redetect_fg = VideoCropFrameGroup(
        LRULocalVideoDecoder(path, cache_size=900),
        name="crop_redetect_volume")(redetect_bounds)
    redetect_patches = Flatten(
        flatten_fn=lambda fg: fg.to_image_intervals())(redetect_fg)
    # redetect_patches = Log("redetect_patches")(redetect_patches)
    # we already sample when generating `redetect_bounds`. Don't sample again here.
    redetect_detection = Detection(server_list=DETECTION_SERVERS,
                                   parallel=16)(redetect_patches)
    redetect_person = DetectionFilterFlatten(['person'], 0.3)(redetect_detection)
    # redetect_person = Log("redetect_person")(redetect_person)

    # Track each re-detected person box into a short trajectory, then
    # concatenate compatible tracklets into long trajectories.
    rekey = 'traj_person'
    short_person_trajectories = TrackFromBox(
        LRULocalVideoDecoder(path, cache_size=900),
        window=detect_step,
        step=2,
        trajectory_key=rekey,
        bidirectional=True,
        parallel_workers=2,
        name='track_person')(redetect_person)
    long_person_trajectories = Coalesce(
        predicate=traj_concatable(3 * detect_step, 0.1, rekey),
        bounds_merge_op=Bounds3D.span,
        payload_merge_op=traj_concat_payload(rekey),
        # distance=lambda i1, i2: _iou(i1.payload[rekey][-1], i2.payload[rekey][0]),
        epsilon=10 * detect_step)(short_person_trajectories)

    def merge_op_getout(ic, ip):
        # Event volume: union of car and person bounds; person trajectory
        # carried along in the payload for later visualization.
        new_bounds = ic.bounds.span(ip)
        # new_bounds['t1'] = max(0, ip['t1'] - 10*fps)  # wind back 3 seconds
        # new_bounds['t2'] = min(frame_count, ip['t1'] + 10*fps)
        new_payload = {rekey: ip.payload[rekey]}
        return Interval(new_bounds, new_payload)

    # Person trajectory must start while the car is stopped, and its first
    # box must overlap the car (IoU > 0.05).
    get_out = JoinWithTimeWindow(
        lambda ic, ip: ic['t1'] < ip['t1'] < ic['t2'] and _iou(
            ic, ip.payload[rekey][0]) > 0.05,
        merge_op=merge_op_getout)(FromIterable(buffered_stopped_cars)(),
                                  long_person_trajectories)
    # dedup final results
    get_out = Coalesce(predicate=and_pred(
        iou_at_least(0.5), or_pred(during_inv(), tiou_at_least(0.5))),
                       payload_merge_op=lambda p1, p2: p1)(get_out)

    # Crop each event from the video and overlay the person trajectory.
    vis_decoder = LRULocalVideoDecoder(path, cache_size=900)
    raw_fg = VideoCropFrameGroup(vis_decoder, copy_payload=True)(get_out)
    visualize_fg = VisualizeTrajectoryOnFrameGroup(
        rekey, name="visualize-person-traj")(raw_fg)
    output = visualize_fg

    # Drive the lazy graph and save each event clip to disk.
    output_sub = output.subscribe()
    output.start_thread_recursive()
    for k, intrvl in enumerate(output_sub):
        assert isinstance(intrvl, FrameGroupInterval)
        out_name = f"{OUTPUT_DIR}/{k}-{intrvl['t1']}-{intrvl['t2']}-{intrvl['x1']:.2f}-{intrvl['y1']:.2f}-{intrvl['x2']:.2f}-{intrvl['y2']:.2f}.mp4"
        intrvl.savevideo(out_name, fps=fps)
        logger.info(f"saved {out_name}")