def interview_with_person_x(guest_name="bernie sanders", min_probability=0.7):
    """Find interview segments between a show host and a named guest.

    Only videos that have at least one LabeledCommercial row are searched.

    Args:
        guest_name: value matched against ``identity__name`` to select the
            guest's face identities. Defaults to the originally hard-coded
            guest.
        min_probability: identities for the guest are kept only when their
            ``probability`` is strictly greater than this value.

    Returns:
        Whatever ``intrvllists_to_result`` builds from the interview
        intervals of every video.
    """
    from query.models import LabeledCommercial, FaceIdentity
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.temporal_predicates import before, after, overlaps
    from rekall.logical_predicates import or_pred
    from esper.rekall import intrvllists_to_result

    def shot_intervals(identity_qs):
        # Represent every identity row by the frame range of the shot that
        # contains the face, keyed by the shot's video.
        return VideoIntervalCollection.from_django_qs(
            identity_qs.annotate(
                video_id=F("face__shot__video_id"),
                min_frame=F("face__shot__min_frame"),
                max_frame=F("face__shot__max_frame")))

    # Get list of sandbox video IDs
    sandbox_videos = [
        row.video_id
        for row in LabeledCommercial.objects.distinct('video_id')
    ]

    # Load hosts and instances of guest from SQL
    identities = FaceIdentity.objects.filter(
        face__shot__video_id__in=sandbox_videos)
    hosts_qs = identities.filter(face__is_host=True)
    guest_qs = identities \
        .filter(identity__name=guest_name) \
        .filter(probability__gt=min_probability)

    # Turn both query sets into per-video collections of shot intervals
    hosts = shot_intervals(hosts_qs)
    guest = shot_intervals(guest_qs)

    # Get all shots where the guest and a host are on screen together
    guest_with_host = guest.overlaps(hosts).coalesce()

    # This temporal predicate defines A overlaps with B, or A before B by
    # less than 10 frames, or A after B by less than 10 frames
    overlaps_before_or_after_pred = or_pred(
        or_pred(overlaps(), before(max_dist=10), arity=2),
        after(max_dist=10),
        arity=2)

    # This code finds sequences of:
    #   guest with host overlaps/before/after host OR
    #   guest with host overlaps/before/after guest
    interview_candidates = guest_with_host \
        .merge(hosts, predicate=overlaps_before_or_after_pred) \
        .set_union(guest_with_host.merge(
            guest, predicate=overlaps_before_or_after_pred)) \
        .coalesce()

    # Sequences may be interrupted by shots where the guest or host don't
    # appear, so dilate and coalesce to merge neighboring segments, then
    # shrink back and drop anything shorter than 1350 frames
    interviews = interview_candidates \
        .dilate(600) \
        .coalesce() \
        .dilate(-600) \
        .filter_length(min_length=1350)

    # Return intervals
    return intrvllists_to_result(interviews.get_allintervals())
def fold_fn(stack, interval):
    """Fold step that merges neighbouring intervals under a length cap.

    ``stack`` holds the intervals produced so far; ``interval`` is the next
    one to fold in. No interval pushed by this function is longer than
    MAX_COMMERCIAL_TIME. The (mutated) stack is returned.
    """
    # Clip an incoming interval that is already longer than the cap.
    if interval.length() > MAX_COMMERCIAL_TIME:
        interval = Interval(interval.start,
                            interval.start + MAX_COMMERCIAL_TIME,
                            interval.payload)

    # First interval: nothing to merge with.
    if not stack:
        stack.append(interval)
        return stack

    previous = stack.pop()
    is_neighbour = or_pred(overlaps(), after(max_dist=5), arity=2)

    # Too far apart: keep both intervals untouched.
    if not is_neighbour(interval, previous):
        stack.extend((previous, interval))
        return stack

    combined = previous.merge(interval)
    if combined.length() > MAX_COMMERCIAL_TIME:
        # Merging would exceed the cap, so keep a capped interval anchored
        # at the previous interval's start instead.
        combined = Interval(previous.start,
                            previous.start + MAX_COMMERCIAL_TIME,
                            previous.payload)
    stack.append(combined)
    return stack
def detect_commercial_rekall(video, transcript_path, blackframe_list=None,
                             histogram=None, verbose=True):
    """
    API for detecting commercial blocks from TV news video using rekall

    @video: object exposing `fps` and `num_frames` (the only attributes read
        here)
    @transcript_path: path handed to load_transcript; the loaded transcript
        is iterated as (text, start_sec, end_sec) entries
    @blackframe_list: list of black frame indices
    @histogram: passed to get_blackframe_list, only used when
        blackframe_list is None

    Return: the interval list of detected commercial blocks, with times in
        seconds (the result of the rekall interval-list operations below)
    """
    transcript = load_transcript(transcript_path)

    # Black frames as one-frame-long intervals, in seconds
    if blackframe_list is None:
        blackframe_intervallist = get_blackframe_list(histogram)
    else:
        blackframe_intervallist = IntervalList([
            (fid2second(fid, video.fps), fid2second(fid + 1, video.fps), 0)
            for fid in blackframe_list
        ])

    # Join consecutive black frames into windows (grow by one frame, merge,
    # shrink back) and keep windows of at least MIN_BLACKWINDOW frames
    black_windows = (
        blackframe_intervallist
        .dilate(1. / video.fps)
        .coalesce()
        .dilate(-1. / video.fps)
        .filter_length(min_length=MIN_BLACKWINDOW * 1. / video.fps)
    )

    # get all instances of >>, Announcer:, and >> Announcer: in transcript
    arrow_text = get_text_intervals(">>", transcript)
    announcer_text = get_text_intervals("Announcer:", transcript)
    arrow_announcer_text = get_text_intervals(">> Announcer:", transcript)

    # get an interval for the whole video
    whole_video = IntervalList([(0., video.num_frames / video.fps, 0)])

    def fold_fn(stack, interval):
        # Merge `interval` into the previous one when they overlap or are at
        # most max_dist=1 apart, unless the merged interval would be longer
        # than MAX_COMMERCIAL_TIME.
        if len(stack) == 0:
            stack.append(interval)
        else:
            last = stack.pop()
            if or_pred(overlaps(), after(max_dist=1), arity=2)(interval, last):
                if last.merge(interval).length() > MAX_COMMERCIAL_TIME:
                    # Too long to merge: keep both, capping `last` if it is
                    # itself over the limit
                    if last.length() > MAX_COMMERCIAL_TIME:
                        stack.append(
                            Interval(last.start,
                                     last.start + MAX_COMMERCIAL_TIME,
                                     last.payload))
                    else:
                        stack.append(last)
                    stack.append(interval)
                else:
                    stack.append(last.merge(interval))
            else:
                stack.append(last)
                stack.append(interval)
        return stack

    # whole video minus black windows to get segments in between black windows
    # then filter out anything that overlaps with ">>" as long as it's not
    # ">> Announcer:"
    # then coalesce, as long as it doesn't get too long
    all_blocks = whole_video.minus(black_windows)
    non_commercial_blocks = all_blocks.filter_against(
        arrow_text.minus(arrow_announcer_text),
        predicate=overlaps())
    commercial_blocks = whole_video.minus(non_commercial_blocks)
    if verbose:
        print("commercial blocks candidates: ({})\n".format(
            commercial_blocks.size()))
        for idx, win in enumerate(commercial_blocks.get_intervals()):
            print(idx, win)

    commercials = (
        commercial_blocks
        .fold_list(fold_fn, [])
        .filter_length(min_length=MIN_COMMERCIAL_TIME)
    )
    if verbose:
        print("commercials from blackwindow:\n", commercials)

    # add in lowercase intervals
    lowercase_intervals = get_lowercase_intervals(transcript)
    if verbose:
        print("lowercase intervals:\n", lowercase_intervals)
    # NOTE(review): the dilate after coalesce() is positive here, whereas the
    # black-window step above shrinks back with a negative value. Left as in
    # the original; confirm whether -MIN_COMMERCIAL_GAP / 2 was intended.
    commercials = (
        commercials
        .set_union(lowercase_intervals)
        .dilate(MIN_COMMERCIAL_GAP / 2)
        .coalesce()
        .dilate(MIN_COMMERCIAL_GAP / 2)
    )
    if verbose:
        print("commercials merge with lowercase:\n", commercials)

    # get blank intervals: parts of the video not covered by any transcript
    # entry (entries are shifted earlier by TRANSCRIPT_DELAY)
    transcript_coverage = IntervalList([
        (start_sec - TRANSCRIPT_DELAY, end_sec - TRANSCRIPT_DELAY, 0)
        for _text, start_sec, end_sec in transcript
    ]).coalesce()
    blank_intervals = (
        whole_video
        .minus(transcript_coverage)
        .coalesce()
        .filter_length(min_length=MIN_BLANKWINDOW,
                       max_length=MAX_BLANKWINDOW)
    )
    if verbose:
        print("blank intervals:\n", blank_intervals)

    # add in blank intervals, but only if adding in the new intervals doesn't
    # get too long
    # NOTE(review): same positive trailing dilate as above - confirm.
    commercials = (
        commercials
        .merge(blank_intervals,
               predicate=or_pred(before(max_dist=MAX_MERGE_GAP),
                                 after(max_dist=MAX_MERGE_GAP),
                                 arity=2),
               working_window=MAX_MERGE_GAP)
        .filter_length(max_length=MAX_MERGE_DURATION)
        .set_union(commercials)
        .dilate(MIN_COMMERCIAL_GAP / 2)
        .coalesce()
        .dilate(MIN_COMMERCIAL_GAP / 2)
    )
    if verbose:
        print("commercials merge with blank intervals:\n", commercials)

    # post-process commercials to get rid of gaps, small commercials, and
    # isolated blocks
    # The negated overlaps() predicate is applied to pairs of intervals, so
    # it is built with arity=2 like every other binary combinator here.
    arrow_without_announcer = arrow_text.filter_against(
        announcer_text,
        predicate=not_pred(overlaps(), arity=2),
        working_window=1.0)
    small_gaps = (
        whole_video
        .minus(commercials)
        .filter_length(max_length=MAX_COMMERCIAL_GAP)
        .filter_against(arrow_without_announcer,
                        predicate=not_pred(overlaps(), arity=2),
                        working_window=1.0)
    )

    # merge with small gaps, but only if that doesn't make things too long
    commercials = (
        commercials
        .set_union(small_gaps.dilate(0.1))
        .coalesce()
        .filter_length(max_length=MAX_COMMERCIAL_TIME)
        .set_union(commercials)
        .coalesce()
    )

    # get isolated commercials: those with no other commercial within
    # MAX_COMMERCIAL_TIME before or after them
    not_isolated_commercials = commercials.filter_against(
        commercials,
        predicate=or_pred(before(max_dist=MAX_COMMERCIAL_TIME),
                          after(max_dist=MAX_COMMERCIAL_TIME),
                          arity=2),
        working_window=MAX_COMMERCIAL_TIME)
    isolated_commercials = commercials.minus(not_isolated_commercials)

    # Drop isolated commercials that are too short, and isolated commercials
    # that coincide exactly with a blank interval and are short enough
    isolated_blanks = (
        isolated_commercials
        .filter_against(blank_intervals, predicate=equal())
        .filter_length(max_length=MAX_ISOLATED_BLANK_TIME)
    )
    commercials_to_delete = (
        isolated_commercials
        .filter_length(max_length=MIN_COMMERCIAL_TIME_FINAL)
        .set_union(isolated_blanks)
    )
    commercials = commercials.minus(commercials_to_delete)

    return commercials
def shot_reverse_shot():
    """Find shot/reverse-shot sequences from sampled face detections.

    Looks for stretches where a large face on the right half of the frame
    and a large face on the left half of the frame follow one another
    closely, and keeps the stretches lasting at least ten seconds.
    """
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred, or_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import intrvllists_to_result, intrvllists_to_result_with_objects, add_intrvllists_to_result

    # If True, visualize results in a timeline
    TIMELINE_OUTPUT = False

    RIGHT_HALF_MIN_X = 0.45
    LEFT_HALF_MAX_X = 0.55
    MIN_FACE_HEIGHT = 0.4
    MAX_FACES_ON_SCREEN = 2
    # faces are sampled every 12 frames
    SAMPLING_RATE = 12
    ONE_SECOND = 24
    FOUR_SECONDS = 96
    TEN_SECONDS = 240

    # Each face row becomes a single-frame interval tagged with its video
    faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)

    # Scene graph with a single node: a face that is tall enough
    graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)],
        }],
        'edges': [],
    }

    # Collect the bounding boxes of all faces sharing an interval into one
    # list payload
    payload_parser = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    vids = VideoIntervalCollection.from_django_qs(
        faces, with_payload=payload_parser
    ).coalesce(payload_merge_op=payload_plus)

    def faces_within(region):
        # Sequences with at most MAX_FACES_ON_SCREEN faces where the scene
        # graph matches inside `region`; dilate by half the sampling rate so
        # consecutive samples join up when coalesced.
        few_faces = payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN))
        face_in_region = payload_satisfies(scene_graph(graph, region=region))
        matching = vids.filter(and_pred(few_faces, face_in_region))
        return matching.dilate(SAMPLING_RATE / 2).coalesce()

    faces_on_right = faces_within(right_half)
    faces_on_left = faces_within(left_half)

    # Pair right-side sequences with left-side sequences that start or end
    # within one second of them
    close_in_time = or_pred(
        before(max_dist=ONE_SECOND), after(max_dist=ONE_SECOND), arity=2)
    paired = faces_on_right.merge(faces_on_left, predicate=close_in_time)

    # Allow four seconds of buffer between left-then-right/right-then-left
    # segments, then only keep sequences that last at least ten seconds
    sequences = paired \
        .dilate(FOUR_SECONDS) \
        .coalesce() \
        .dilate(-1 * FOUR_SECONDS) \
        .filter_length(min_length=TEN_SECONDS)

    # Post-process to display in Esper widget
    if TIMELINE_OUTPUT:
        results = intrvllists_to_result(sequences.get_allintervals())
        add_intrvllists_to_result(
            results, faces_on_left.get_allintervals(), color='black')
        add_intrvllists_to_result(
            results, faces_on_right.get_allintervals(), color='green')
    else:
        results = intrvllists_to_result_with_objects(
            sequences.get_allintervals(), lambda payload, video: [])

    return results