def interview_with_person_x():
    from query.models import LabeledCommercial, FaceIdentity
    from django.db.models import F
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.temporal_predicates import before, after, overlaps
    from rekall.logical_predicates import or_pred
    from esper.rekall import intrvllists_to_result

    # Get list of sandbox video IDs
    sandbox_videos = [
        row.video_id
        for row in LabeledCommercial.objects.distinct('video_id')
    ]

    guest_name = "bernie sanders"

    # Load hosts and instances of the guest from SQL
    identities = FaceIdentity.objects.filter(
        face__shot__video_id__in=sandbox_videos)
    hosts_qs = identities.filter(face__is_host=True)
    guest_qs = identities.filter(
        identity__name=guest_name).filter(probability__gt=0.7)

    # Load the host and guest bounding boxes from the SQL querysets
    hosts = VideoIntervalCollection.from_django_qs(
        hosts_qs.annotate(video_id=F("face__shot__video_id"),
                          min_frame=F("face__shot__min_frame"),
                          max_frame=F("face__shot__max_frame")))
    guest = VideoIntervalCollection.from_django_qs(
        guest_qs.annotate(video_id=F("face__shot__video_id"),
                          min_frame=F("face__shot__min_frame"),
                          max_frame=F("face__shot__max_frame")))

    # Get all shots where the guest and a host are on screen together
    guest_with_host = guest.overlaps(hosts).coalesce()

    # This temporal predicate defines: A overlaps with B, or A is before B
    # by less than 10 frames, or A is after B by less than 10 frames
    overlaps_before_or_after_pred = or_pred(
        or_pred(overlaps(), before(max_dist=10), arity=2),
        after(max_dist=10), arity=2)

    # This code finds sequences of:
    #   guest-with-host overlapping/before/after a host shot, OR
    #   guest-with-host overlapping/before/after a guest shot
    interview_candidates = guest_with_host \
        .merge(hosts, predicate=overlaps_before_or_after_pred) \
        .set_union(guest_with_host.merge(
            guest, predicate=overlaps_before_or_after_pred)) \
        .coalesce()

    # Sequences may be interrupted by shots where the guest or host doesn't
    # appear, so dilate and coalesce to merge neighboring segments
    interviews = interview_candidates \
        .dilate(600) \
        .coalesce() \
        .dilate(-600) \
        .filter_length(min_length=1350)

    # Return intervals
    return intrvllists_to_result(interviews.get_allintervals())
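
# Hedged sketch (not part of the original query set; interval values are
# invented for illustration): the dilate/coalesce/dilate idiom used above
# bridges gaps of up to 2 * 600 frames between neighboring segments and then
# restores the original endpoints. The same pattern on a toy IntervalList:
def _example_gap_bridging():
    from rekall.interval_list import IntervalList
    segments = IntervalList([(0, 100, 0), (700, 800, 0), (5000, 5100, 0)])
    # dilate(600) grows each interval by 600 on both sides, so intervals
    # within 1200 of each other overlap; coalesce merges the overlapping
    # ones; dilate(-600) shrinks the merged result back.
    bridged = segments.dilate(600).coalesce().dilate(-600)
    # (0, 100) and (700, 800) merge into (0, 800); (5000, 5100) is untouched.
    return bridged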
def fold_fn(stack, interval):
    if interval.length() > MAX_COMMERCIAL_TIME:
        interval = Interval(interval.start,
                            interval.start + MAX_COMMERCIAL_TIME,
                            interval.payload)
    if len(stack) == 0:
        stack.append(interval)
    else:
        last = stack.pop()
        if or_pred(overlaps(), after(max_dist=5), arity=2)(interval, last):
            if last.merge(interval).length() > MAX_COMMERCIAL_TIME:
                stack.append(Interval(
                    last.start, last.start + MAX_COMMERCIAL_TIME,
                    last.payload))
            else:
                stack.append(last.merge(interval))
        else:
            stack.append(last)
            stack.append(interval)
    return stack
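
# Hedged sketch (assumed context; intervals are invented): fold_fn above
# expects module-level MAX_COMMERCIAL_TIME, Interval, or_pred, overlaps, and
# after, as in the detect_commercial_rekall functions below. It is meant to
# be driven by IntervalList.fold_list, which threads the accumulator list
# through every interval in order.
def _example_fold_fn_usage():
    from rekall.interval_list import IntervalList
    # Candidates that overlap or sit within 5 units of the previous block
    # are merged, but each merged block is capped at MAX_COMMERCIAL_TIME
    # measured from the block's start.
    candidates = IntervalList([(0, 100, 0), (103, 220, 0), (500, 600, 0)])
    commercial_blocks = candidates.fold_list(fold_fn, [])
    return commercial_blocks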
def shot_reverse_shot_complex():
    from query.models import Face, Shot
    from django.db.models import F
    import numpy as np
    from rekall.merge_ops import payload_second, payload_plus
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.interval_list import Interval, IntervalList
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after, overlaps
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import intrvllists_to_result_with_objects

    VIDEO_NAME = 'godfather part iii'
    MAX_FACE_MOVEMENT = 0.15
    MIN_FACE_HEIGHT = 0.2
    MAX_FACES_ON_SCREEN = 4
    RIGHT_HALF_MIN_X = 0.33
    LEFT_HALF_MAX_X = 0.66
    SHOTS_LABELER_ID = 64

    # faces are sampled every 12 frames
    SAMPLING_RATE = 12

    # Annotate face rows with start and end frames and the video ID
    faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id')).filter(
            frame__video__name__contains=VIDEO_NAME)

    shots = VideoIntervalCollection.from_django_qs(
        Shot.objects.filter(video__name__contains=VIDEO_NAME,
                            labeler_id=SHOTS_LABELER_ID),
        with_payload=lambda obj: [])

    # vids are all faces for each frame
    vids = VideoIntervalCollection.from_django_qs(
        faces.filter(probability__gte=0.99),
        with_payload=in_array(
            bbox_payload_parser(
                VideoIntervalCollection.django_accessor))).coalesce(
                    payload_merge_op=payload_plus)

    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)

    graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)]
        }],
        'edges': []
    }

    faces_on_right = vids.filter(
        and_pred(payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN)),
                 payload_satisfies(scene_graph(graph, region=right_half))))
    faces_on_left = vids.filter(
        and_pred(payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN)),
                 payload_satisfies(scene_graph(graph, region=left_half))))

    def wrap_list(intvl):
        intvl.payload = [intvl.payload]
        return intvl

    def get_height(box):
        return box['y2'] - box['y1']

    def get_center(box):
        return ((box['x1'] + box['x2']) / 2, (box['y1'] + box['y2']) / 2)

    def get_distance(pt1, pt2):
        return np.sqrt((pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2)

    def find_highest_box(boxes):
        if len(boxes) == 0:
            return None
        result = boxes[0]
        best = get_height(result)
        for i in range(1, len(boxes)):
            h = get_height(boxes[i])
            if h > best:
                best = h
                result = boxes[i]
        return result

    def take_highest_in_frame(intvl):
        result = []
        for faces_in_frame in intvl.payload:
            largest = find_highest_box(faces_in_frame)
            if largest is not None:
                result.append(largest)
        intvl.payload = result
        return intvl

    # Check that the displacement of the box center between consecutive
    # frames stays within `dist`
    def inter_frame_movement_less_than(dist):
        def check(boxes):
            for b1, b2 in zip(boxes, boxes[1:]):
                if get_distance(get_center(b1), get_center(b2)) > dist:
                    return False
            return True
        return check

    # Payload is a list; each element is a list of faces for a frame
    shots_with_face_on_right = shots.merge(
        faces_on_right, predicate=overlaps(),
        payload_merge_op=payload_second).map(wrap_list).coalesce(
            payload_merge_op=payload_plus).map(take_highest_in_frame).filter(
                payload_satisfies(
                    inter_frame_movement_less_than(MAX_FACE_MOVEMENT)))
    shots_with_face_on_left = shots.merge(
        faces_on_left, predicate=overlaps(),
        payload_merge_op=payload_second).map(wrap_list).coalesce(
            payload_merge_op=payload_plus).map(take_highest_in_frame).filter(
                payload_satisfies(
                    inter_frame_movement_less_than(MAX_FACE_MOVEMENT)))

    # Right-Left-Right sequences
    shot_reverse_shot_1 = shots_with_face_on_right.merge(
        shots_with_face_on_left,
        predicate=before(max_dist=1)).merge(shots_with_face_on_right,
                                            predicate=before(max_dist=1))
    # Left-Right-Left sequences
    shot_reverse_shot_2 = shots_with_face_on_left.merge(
        shots_with_face_on_right,
        predicate=before(max_dist=1)).merge(shots_with_face_on_left,
                                            predicate=before(max_dist=1))

    shot_reverse_shot = shot_reverse_shot_1.set_union(
        shot_reverse_shot_2).coalesce()

    result = intrvllists_to_result_with_objects(
        shot_reverse_shot.get_allintervals(),
        payload_to_objs=lambda p, v: [])
    return result
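
# Hedged sketch (inputs are hypothetical): the two chained merges above
# implement an A-B-A pattern search. merge with before(max_dist=1) spans
# each pair of abutting intervals, so chaining it twice spans
# right-left-right (or left-right-left) shot triples. The pattern in
# isolation:
def _example_aba_pattern(a_shots, b_shots):
    from rekall.temporal_predicates import before
    # a_shots and b_shots are VideoIntervalCollections; the result covers
    # every A shot immediately followed by a B shot immediately followed by
    # another A shot.
    return a_shots.merge(b_shots, predicate=before(max_dist=1)).merge(
        a_shots, predicate=before(max_dist=1))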
def detect_commercial_rekall(video, transcript_path, blackframe_list=None,
                             histogram=None, verbose=True):
    """
    API for detecting commercial blocks from TV news video using rekall

    @video: django query set
    @transcript_path: transcript_path
    @blackframe_list: list of black frame indexes
    @histogram: list of 16x3-bin histograms for each frame; not used if
        blackframe_list is provided

    Return:
        commercial_list (list of tuple((start_fid, start_sec),
        (end_fid, end_sec)), None if failed)
    """
    transcript = load_transcript(transcript_path)
    if blackframe_list is None:
        blackframe_intervallist = get_blackframe_list(histogram)
    else:
        blackframe_intervallist = IntervalList([
            (fid2second(fid, video.fps), fid2second(fid + 1, video.fps), 0)
            for fid in blackframe_list
        ])
    black_windows = blackframe_intervallist \
        .dilate(1. / video.fps) \
        .coalesce() \
        .dilate(-1. / video.fps) \
        .filter_length(min_length=MIN_BLACKWINDOW * 1. / video.fps)
    # if verbose:
    #     print("black window: ({})\n".format(black_windows.size()))
    #     for idx, win in enumerate(black_windows.get_intervals()):
    #         print(idx, win)

    # get all instances of >>, Announcer:, and >> Announcer: in the
    # transcript
    arrow_text = get_text_intervals(">>", transcript)
    announcer_text = get_text_intervals("Announcer:", transcript)
    arrow_announcer_text = get_text_intervals(">> Announcer:", transcript)
    # if verbose:
    #     print('arrow_text', arrow_text)
    #     print('announcer_text', announcer_text)
    #     print('arrow_announcer_text', arrow_announcer_text)

    # get an interval for the whole video
    whole_video = IntervalList([(0., video.num_frames / video.fps, 0)])

    # whole video minus black windows to get segments in between black
    # windows; then filter out anything that overlaps with ">>" as long as
    # it's not ">> Announcer:"; then coalesce, as long as it doesn't get
    # too long
    def fold_fn(stack, interval):
        if len(stack) == 0:
            stack.append(interval)
        else:
            last = stack.pop()
            if or_pred(overlaps(), after(max_dist=1),
                       arity=2)(interval, last):
                if last.merge(interval).length() > MAX_COMMERCIAL_TIME:
                    if last.length() > MAX_COMMERCIAL_TIME:
                        stack.append(
                            Interval(last.start,
                                     last.start + MAX_COMMERCIAL_TIME,
                                     last.payload))
                    else:
                        stack.append(last)
                    stack.append(interval)
                else:
                    stack.append(last.merge(interval))
            else:
                stack.append(last)
                stack.append(interval)
        return stack

    all_blocks = whole_video.minus(black_windows)
    non_commercial_blocks = all_blocks.filter_against(
        arrow_text.minus(arrow_announcer_text), predicate=overlaps())
    commercial_blocks = whole_video.minus(non_commercial_blocks)
    if verbose:
        print("commercial blocks candidates: ({})\n".format(
            commercial_blocks.size()))
        for idx, win in enumerate(commercial_blocks.get_intervals()):
            print(idx, win)
    commercials = commercial_blocks \
        .fold_list(fold_fn, []) \
        .filter_length(min_length=MIN_COMMERCIAL_TIME)
    # commercials = whole_video \
    #     .minus(black_windows) \
    #     .filter_against(
    #         arrow_text.filter_against(arrow_announcer_text,
    #             predicate=not_pred(overlaps(), arity=2)),
    #         predicate=not_pred(overlaps(), arity=2)
    #     ) \
    #     .set_union(black_windows) \
    #     .fold_list(fold_fn, []) \
    #     .filter_length(min_length=MIN_COMMERCIAL_TIME)
    if verbose:
        print("commercials from blackwindow:\n", commercials)

    # add in lowercase intervals
    lowercase_intervals = get_lowercase_intervals(transcript)
    if verbose:
        print("lowercase intervals:\n", lowercase_intervals)
    commercials = commercials \
        .set_union(lowercase_intervals) \
        .dilate(MIN_COMMERCIAL_GAP / 2) \
        .coalesce() \
        .dilate(-MIN_COMMERCIAL_GAP / 2)
    if verbose:
        print("commercials merge with lowercase:\n", commercials)
    # if verbose:
    #     print(whole_video)
    #     print(IntervalList([
    #         (start_sec - TRANSCRIPT_DELAY, end_sec - TRANSCRIPT_DELAY, 0)
    #         for text, start_sec, end_sec in transcript
    #     ]).coalesce().size())

    # get blank intervals
    blank_intervals = whole_video.minus(
        IntervalList([
            (start_sec - TRANSCRIPT_DELAY, end_sec - TRANSCRIPT_DELAY, 0)
            for text, start_sec, end_sec in transcript
        ]).coalesce()).coalesce().filter_length(
            min_length=MIN_BLANKWINDOW, max_length=MAX_BLANKWINDOW)
    if verbose:
        print("blank intervals:\n", blank_intervals)

    # add in blank intervals, but only if adding in the new intervals
    # doesn't get too long
    commercials = commercials.merge(
        blank_intervals,
        predicate=or_pred(before(max_dist=MAX_MERGE_GAP),
                          after(max_dist=MAX_MERGE_GAP), arity=2),
        working_window=MAX_MERGE_GAP) \
        .filter_length(max_length=MAX_MERGE_DURATION) \
        .set_union(commercials) \
        .dilate(MIN_COMMERCIAL_GAP / 2) \
        .coalesce() \
        .dilate(-MIN_COMMERCIAL_GAP / 2)
    if verbose:
        print("commercials merge with blank intervals:\n", commercials)

    # post-process commercials to get rid of gaps, small commercials, and
    # isolated blocks
    small_gaps = whole_video \
        .minus(commercials) \
        .filter_length(max_length=MAX_COMMERCIAL_GAP) \
        .filter_against(
            arrow_text.filter_against(
                announcer_text,
                predicate=not_pred(overlaps()),
                working_window=1.0
            ), predicate=not_pred(overlaps()),
            working_window=1.0)

    # merge with small gaps, but only if that doesn't make things too long
    commercials = commercials \
        .set_union(small_gaps.dilate(0.1)) \
        .coalesce() \
        .filter_length(max_length=MAX_COMMERCIAL_TIME) \
        .set_union(commercials) \
        .coalesce()

    # get isolated commercials
    not_isolated_commercials = commercials.filter_against(
        commercials,
        predicate=or_pred(before(max_dist=MAX_COMMERCIAL_TIME),
                          after(max_dist=MAX_COMMERCIAL_TIME), arity=2),
        working_window=MAX_COMMERCIAL_TIME)
    isolated_commercials = commercials.minus(not_isolated_commercials)
    commercials_to_delete = isolated_commercials \
        .filter_length(max_length=MIN_COMMERCIAL_TIME_FINAL) \
        .set_union(isolated_commercials
                   .filter_against(blank_intervals, predicate=equal())
                   .filter_length(max_length=MAX_ISOLATED_BLANK_TIME))

    commercials = commercials.minus(commercials_to_delete)

    return commercials
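
# Hedged usage sketch: the transcript path and black-frame ids below are
# hypothetical; detect_commercial_rekall only assumes a video row with fps
# and num_frames plus a transcript file readable by load_transcript.
def _example_detect_commercials(video):
    transcript_path = '/path/to/transcript.srt'  # hypothetical path
    blackframes = [1200, 1201, 1202]             # hypothetical frame ids
    return detect_commercial_rekall(video, transcript_path,
                                    blackframe_list=blackframes,
                                    verbose=False)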
def kissing():
    # Takes 7 min to run!
    from query.models import Face, Shot, Labeler, Video
    from django.db.models import F
    import numpy as np
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, \
        merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.temporal_predicates import overlaps
    from rekall.face_landmark_predicates import looking_left, looking_right
    from rekall.bbox_predicates import height_at_least, same_height
    import esper.face_landmarks_wrapper as flw
    from esper.captions import get_all_segments
    from esper.rekall import intrvllists_to_result_with_objects, \
        bbox_to_result_object
    from esper.stdlib import face_landmarks_to_dict

    MAX_MOUTH_DIFF = 0.12
    MIN_FACE_CONFIDENCE = 0.8
    MIN_FACE_HEIGHT = 0.4
    MAX_FACE_HEIGHT_DIFF = 0.1
    MIN_FACE_OVERLAP_X = 0.05
    MIN_FACE_OVERLAP_Y = 0.2
    MAX_FACE_OVERLAP_X_FRACTION = 0.7
    MIN_FACE_ANGLE = 0.1

    def map_payload(func):
        def map_fn(intvl):
            intvl.payload = func(intvl.payload)
            return intvl
        return map_fn

    def get_landmarks(faces):
        ids = [face['id'] for face in faces]
        landmarks = flw.get(Face.objects.filter(id__in=ids))
        for face, landmark in zip(faces, landmarks):
            face['landmarks'] = landmark
        return faces

    # Annotate face rows with start and end frames and the video ID
    faces_qs = Face.objects.filter(
        probability__gte=MIN_FACE_CONFIDENCE).annotate(
            min_frame=F('frame__number'),
            max_frame=F('frame__number'),
            height=F('bbox_y2') - F('bbox_y1'),
            video_id=F('frame__video_id')).filter(
                height__gte=MIN_FACE_HEIGHT)

    faces = VideoIntervalCollection.from_django_qs(
        faces_qs,
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'id': 'id'})
            ]))).coalesce(payload_merge_op=payload_plus)

    graph = {
        'nodes': [
            {
                'name': 'face_left',
                'predicates': []
            },
            {
                'name': 'face_right',
                'predicates': []
            },
        ],
        'edges': [
            {
                'start': 'face_left',
                'end': 'face_right',
                'predicates': [
                    # Left face is on the left
                    lambda f1, f2: f1['x2'] < f2['x2'] and
                    f1['x1'] < f2['x1'],
                    # Faces overlap horizontally
                    lambda f1, f2: f1['x2'] - f2['x1'] > MIN_FACE_OVERLAP_X,
                    # Faces overlap vertically (original compared f1['y1']
                    # with itself; fixed to use f2['y1'])
                    lambda f1, f2: min(f1['y2'], f2['y2']) - max(
                        f1['y1'], f2['y1']) > MIN_FACE_OVERLAP_Y,
                    # No face is entirely above another
                    lambda f1, f2: f1['y2'] > f2['y1'] and
                    f1['y1'] < f2['y2'],
                    same_height(MAX_FACE_HEIGHT_DIFF),
                    lambda f1, f2: (f1['x2'] - f2['x1']) / max(
                        f1['x2'] - f1['x1'], f2['x2'] - f2['x1']
                    ) < MAX_FACE_OVERLAP_X_FRACTION
                ]
            },
        ]
    }

    def mouths_are_close(lm1, lm2):
        select_outer = [2, 3, 4, 8, 9, 10]
        select_inner = [1, 2, 3, 5, 6, 7]
        mouth1 = np.concatenate((lm1.outer_lips()[select_outer],
                                 lm1.inner_lips()[select_inner]))
        mouth2 = np.concatenate((lm2.outer_lips()[select_outer],
                                 lm2.inner_lips()[select_inner]))
        mean1 = np.mean(mouth1, axis=0)
        mean2 = np.mean(mouth2, axis=0)
        return np.linalg.norm(mean1 - mean2) <= MAX_MOUTH_DIFF

    # A face is in profile if both eyes are on the same side of the nose
    # bridge horizontally.
    def is_left_profile(f):
        lm = f['landmarks']
        nose_x = min(lm.nose_bridge()[:, 0])
        left = np.all(lm.left_eye()[:, 0] >= nose_x)
        right = np.all(lm.right_eye()[:, 0] >= nose_x)
        return left and right

    def is_right_profile(f):
        lm = f['landmarks']
        nose_x = max(lm.nose_bridge()[:, 0])
        left = np.all(lm.left_eye()[:, 0] <= nose_x)
        right = np.all(lm.right_eye()[:, 0] <= nose_x)
        return left and right

    # Line is ax + by + c = 0
    def project_point_to_line(pt, a, b, c):
        x0, y0 = pt[0], pt[1]
        d = a * a + b * b
        x = (b * (b * x0 - a * y0) - a * c) / d
        y = (a * (-b * x0 + a * y0) - b * c) / d
        return np.array([x, y])

    # Positive if facing right
    def signed_face_angle(lm):
        center_line_indices = [27, 28, 32, 33, 34, 51, 62, 66, 57]
        data = lm.landmarks[center_line_indices]
        # y = ax + b
        fit = np.polyfit(data[:, 0], data[:, 1], 1)
        a, b = fit[0], fit[1]
        A = project_point_to_line(lm.landmarks[center_line_indices[0]],
                                  a, -1, b)
        B = project_point_to_line(lm.landmarks[center_line_indices[-1]],
                                  a, -1, b)
        AB = B - A
        AB = AB / np.linalg.norm(AB)
        C = np.mean(lm.nose_bridge()[2:4], axis=0)
        AC = C - A
        AC = AC / np.linalg.norm(AC)
        return np.cross(AB, AC)

    graph2 = {
        'nodes': [
            {
                'name': 'left',
                'predicates': [
                    lambda f: signed_face_angle(f['landmarks']) >
                    MIN_FACE_ANGLE
                    # is_right_profile
                ]
            },
            {
                'name': 'right',
                'predicates': [
                    lambda f: signed_face_angle(f['landmarks']) <
                    -MIN_FACE_ANGLE
                    # is_left_profile
                ]
            },
        ],
        'edges': [{
            'start': 'left',
            'end': 'right',
            'predicates': [
                lambda l, r: mouths_are_close(l['landmarks'],
                                              r['landmarks']),
            ]
        }]
    }

    mf_up_close = faces.filter(
        payload_satisfies(scene_graph(graph, exact=True))).map(
            map_payload(get_landmarks)).filter(
                payload_satisfies(scene_graph(graph2, exact=True)))

    vids = mf_up_close.get_allintervals().keys()

    # Merge with shots
    shots_qs = Shot.objects.filter(
        video_id__in=vids,
        labeler=Labeler.objects.get(name='shot-hsvhist-face')).all()
    total = shots_qs.count()
    print("Total shots:", total)
    # use an empty list as the payload
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs, with_payload=lambda row: [], progress=True, total=total)

    kissing_shots = mf_up_close.join(
        shots,
        lambda kiss, shot: [(kiss.get_start(), shot.get_end(),
                             kiss.get_payload())],
        predicate=overlaps(),
        working_window=1).coalesce()

    # Getting faces in the shot
    def wrap_in_list(intvl):
        intvl.payload = [intvl.payload]
        return intvl

    print("Getting faces...")
    faces_qs2 = Face.objects.filter(frame__video_id__in=vids,
                                    probability__gte=MIN_FACE_CONFIDENCE)
    total = faces_qs2.count()
    faces2 = VideoIntervalCollection.from_django_qs(
        faces_qs2.annotate(min_frame=F('frame__number'),
                           max_frame=F('frame__number'),
                           video_id=F('frame__video_id')),
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'frame': 'min_frame'})
            ])),
        progress=True,
        total=total).coalesce(
            payload_merge_op=payload_plus).map(wrap_in_list)

    def clip_to_last_frame_with_two_faces(intvl):
        faces = intvl.get_payload()[1]
        two_faces = [(f[0], f[1]) for f in faces if len(f) == 2]
        two_high_faces = [
            (a, b) for a, b in two_faces
            if min(a['y2'] - a['y1'], b['y2'] - b['y1']) >= MIN_FACE_HEIGHT
        ]
        frame = [a['frame'] for a, b in two_high_faces]
        if len(frame) > 0:
            intvl.end = frame[-1]
        return intvl

    clipped_kissing_shots = kissing_shots.merge(
        faces2,
        payload_merge_op=lambda p1, p2: (p1, p2),
        predicate=overlaps(),
        working_window=1).coalesce(
            payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1])).map(
                clip_to_last_frame_with_two_faces).filter_length(
                    min_length=12)

    results = get_all_segments(vids)
    fps_map = dict((i, Video.objects.get(id=i).fps) for i in vids)
    caption_results = VideoIntervalCollection({
        video_id: [
            (word[0] * fps_map[video_id],  # start frame
             word[1] * fps_map[video_id],  # end frame
             word[2])                      # payload is the word
            for word in words
        ]
        for video_id, words in results
    })
    kissing_without_words = clipped_kissing_shots.minus(caption_results)
    kissing_final = kissing_without_words.map(
        lambda intvl: (int(intvl.start), int(intvl.end),
                       intvl.payload)).coalesce().filter_length(
                           min_length=12)

    def payload_to_objects(p, video_id):
        return [face_landmarks_to_dict(face['landmarks'])
                for face in p[0]] + \
               [bbox_to_result_object(face, video_id) for face in p[0]]

    return intrvllists_to_result_with_objects(
        kissing_final.get_allintervals(),
        lambda p, vid: payload_to_objects(p, vid),
        stride=1)
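
# Hedged check (standalone; points are invented): project_point_to_line in
# kissing() uses the closed-form orthogonal projection onto ax + by + c = 0.
# For the line y = x (a=1, b=-1, c=0), the point (2, 0) should project to
# (1, 1).
def _example_projection_check():
    import numpy as np
    a, b, c = 1.0, -1.0, 0.0
    x0, y0 = 2.0, 0.0
    d = a * a + b * b
    x = (b * (b * x0 - a * y0) - a * c) / d
    y = (a * (-b * x0 + a * y0) - b * c) / d
    assert np.allclose([x, y], [1.0, 1.0])
    return np.array([x, y])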
def conversations_for_display():
    from query.models import Face, FaceCharacterActor, Shot
    from django.db.models import F
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.interval_list import Interval, IntervalList
    from rekall.parsers import in_array, bbox_payload_parser, \
        merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus, merge_named_payload, \
        payload_second
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred, or_pred, true_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after, overlaps, equal
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import intrvllists_to_result, \
        intrvllists_to_result_bbox, intrvllists_to_result_with_objects, \
        add_intrvllists_to_result
    from esper.prelude import esper_widget
    import esper.face_embeddings as face_embeddings

    video_id = 15
    EMBEDDING_EQUALITY_THRESHOLD = 1.
    ONE_FRAME = 1

    faces_qs = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id')).filter(
            frame__video_id=video_id, frame__regularly_sampled=True)
    faces_per_frame = VideoIntervalCollection.from_django_qs(
        faces_qs,
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'face_id': 'id'}),
            ]))).coalesce(payload_merge_op=payload_plus)

    shots_qs = Shot.objects.filter(cinematic=True)
    shots = VideoIntervalCollection.from_django_qs(shots_qs)

    shots_with_faces = shots.merge(
        faces_per_frame,
        predicate=overlaps(),
        payload_merge_op=lambda shot_id, faces_in_frame:
        (shot_id, [faces_in_frame])).coalesce(
            payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1]))

    def cluster_center(face_ids):
        # print("About to compute mean")
        mean_embedding = face_embeddings.mean(face_ids)
        # print("About to compute dist", face_ids)
        dists = face_embeddings.dist(face_ids, [mean_embedding])
        # print("Done computing dist")
        return min(zip(dists, face_ids))[1]

    def cluster_and_compute_centers(faces_in_frame_list, shot_id):
        num_people = max(
            len(faces_in_frame) for faces_in_frame in faces_in_frame_list)
        face_ids = [
            face['face_id'] for faces_in_frame in faces_in_frame_list
            for face in faces_in_frame
        ]
        face_heights = [
            face['y2'] - face['y1']
            for faces_in_frame in faces_in_frame_list
            for face in faces_in_frame
        ]
        print(num_people)
        if num_people == 1:
            clusters = [(fid, 0) for fid in face_ids]
        else:
            clusters = face_embeddings.kmeans(face_ids, num_people)
        # print("Done clustering")
        centers = [(cluster_center([
            face_id for face_id, cluster_id in clusters if cluster_id == i
        ]), [
            face_id for face_id, cluster_id in clusters if cluster_id == i
        ], shot_id,
                    max([
                        face_heights[face_ids.index(face_id)]
                        for face_id, cluster_id in clusters
                        if cluster_id == i
                    ])) for i in range(num_people)]
        # print("Done computing the center")
        return centers

    # print("About to compute clusters")
    shots_with_centers = shots_with_faces.map(lambda intrvl: (
        intrvl.start, intrvl.end,
        (intrvl.payload[0],
         cluster_and_compute_centers(intrvl.payload[1],
                                     intrvl.payload[0]))))
    # print("Clusters computed")

    def same_face(center1, center2):
        return face_embeddings.dist(
            [center1],
            target_ids=[center2])[0] < EMBEDDING_EQUALITY_THRESHOLD

    def cross_product_faces(intrvl1, intrvl2):
        payload1 = intrvl1.get_payload()
        payload2 = intrvl2.get_payload()
        payload = []
        for cluster1 in payload1[1]:
            for cluster2 in payload2[1]:
                if not same_face(cluster1[0], cluster2[0]):
                    new_payload = {'A': cluster1, 'B': cluster2}
                    payload.append(new_payload)
        return [(min(intrvl1.get_start(), intrvl2.get_start()),
                 max(intrvl1.get_end(), intrvl2.get_end()), {
                     'chrs': payload,
                     'shots': [payload1[0], payload2[0]]
                 })]

    two_shots = shots_with_centers.join(
        shots_with_centers,
        predicate=after(max_dist=ONE_FRAME, min_dist=ONE_FRAME),
        merge_op=cross_product_faces)
    # print("Cross product done")

    def faces_equal(payload1, payload2):
        for face_pair1 in payload1['chrs']:
            for face_pair2 in payload2['chrs']:
                if (same_face(face_pair1['A'][0], face_pair2['A'][0])
                        and same_face(face_pair1['B'][0],
                                      face_pair2['B'][0])):
                    return True
                if (same_face(face_pair1['A'][0], face_pair2['B'][0])
                        and same_face(face_pair1['B'][0],
                                      face_pair2['A'][0])):
                    return True
        return False

    convs = two_shots.coalesce(
        predicate=payload_satisfies(faces_equal, arity=2),
        payload_merge_op=lambda payload1, payload2: {
            'chrs': payload1['chrs'] + payload2['chrs'],
            'shots': payload1['shots'] + payload2['shots']
        })
    # print("Coalesce done")

    adjacent_seq = convs.merge(
        convs,
        predicate=and_pred(after(max_dist=ONE_FRAME, min_dist=ONE_FRAME),
                           payload_satisfies(faces_equal, arity=2),
                           arity=2),
        payload_merge_op=lambda payload1, payload2: {
            'chrs': payload1['chrs'] + payload2['chrs'],
            'shots': payload1['shots'] + payload2['shots']
        },
        working_window=1)
    convs = convs.set_union(adjacent_seq)
    # convs = convs.coalesce(predicate=times_equal,
    #                        payload_merge_op=shots_equal)
    # print("Two-shot adjacencies done")

    def filter_fn(intvl):
        payload = intvl.get_payload()
        if type(payload) is dict and 'shots' in payload:
            return len(set(payload['shots'])) >= 3
        return False

    convs = convs.filter(filter_fn)
    convs = convs.coalesce()
    # print("Final filter done")

    # for video_id in convs.intervals.keys():
    #     print(video_id)
    #     intvllist = convs.get_intervallist(video_id)
    #     for intvl in intvllist.get_intervals():
    #         print(intvl.payload)
    #         print(str(intvl.start) + ':' + str(intvl.end))

    return intrvllists_to_result_with_objects(convs, lambda a, b: [])
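
# Hedged sketch (payloads are invented): conversations_for_display coalesces
# two-shot intervals only when a payload predicate holds across the pair.
# The same coalesce(predicate=..., payload_merge_op=...) shape works for any
# payload-aware merging; here the payloads are simple character-name sets.
def _example_predicate_coalesce(two_shot_collection):
    from rekall.payload_predicates import payload_satisfies

    def same_characters(p1, p2):
        # Merge only when the two intervals share at least one character
        return len(p1['chars'] & p2['chars']) > 0

    return two_shot_collection.coalesce(
        predicate=payload_satisfies(same_characters, arity=2),
        payload_merge_op=lambda p1, p2: {
            'chars': p1['chars'] | p2['chars']
        })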
def reaction_shots_apollo_13():
    from query.models import FaceCharacterActor, Frame, Shot, Video
    from django.db.models import F
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.temporal_predicates import overlaps
    from rekall.parsers import in_array, merge_dict_parsers, \
        bbox_payload_parser, dict_payload_parser
    from esper.caption_metadata import caption_metadata_for_video
    from esper.captions import get_all_segments
    from esper.rekall import intrvllists_to_result_with_objects

    videos = Video.objects.filter(name__contains="apollo 13").all()

    # Load script data
    metadata = VideoIntervalCollection({
        video.id: caption_metadata_for_video(video.id)
        for video in videos
    }).filter(lambda meta_interval: (
        meta_interval.payload['speaker'] is not None and
        "man's voice" not in meta_interval.payload['speaker'] and
        meta_interval.payload['speaker'].strip() != "gene krantz"))

    all_segments = get_all_segments([video.id for video in videos])
    captions_interval_collection = VideoIntervalCollection(
        {video: intervals for video, intervals in all_segments})
    captions_with_speaker_id = captions_interval_collection.overlaps(
        metadata.filter(payload_satisfies(lambda p: p['aligned'])),
        payload_merge_op=lambda word, script_meta:
        (word[0], script_meta['speaker']))

    # Annotate face rows with start and end frames and the video ID
    faces_with_character_actor_qs = FaceCharacterActor.objects.annotate(
        min_frame=F('face__frame__number'),
        max_frame=F('face__frame__number'),
        video_id=F('face__frame__video_id'),
        character_name=F('characteractor__character__name')).filter(
            video_id__in=[v.id for v in videos])

    frames_with_identity = VideoIntervalCollection.from_django_qs(
        faces_with_character_actor_qs,
        with_payload=in_array(
            dict_payload_parser(VideoIntervalCollection.django_accessor,
                                {'character': 'character_name'}),
        )).coalesce(payload_merge_op=payload_plus)

    # Annotate shots with all the people in them
    shots_qs = Shot.objects.filter(
        cinematic=True,
        video_id__in=[v.id for v in videos]).annotate(fps=F('video__fps'))
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs, with_payload=lambda shot: shot.fps)

    # Annotate shots with the mode shot scale
    frames_with_shot_scale_qs = Frame.objects.filter(
        regularly_sampled=True,
        video_id__in=[v.id for v in videos]).annotate(
            min_frame=F('number'),
            max_frame=F('number'),
            shot_scale_name=F('shot_scale__name')).all()
    frames_with_shot_scale = VideoIntervalCollection.from_django_qs(
        frames_with_shot_scale_qs,
        with_payload=lambda f: f.shot_scale_name)

    def get_mode(items):
        return max(set(items), key=items.count)

    shots_with_scale = shots.merge(
        frames_with_shot_scale,
        predicate=overlaps(),
        payload_merge_op=lambda shot_fps, shot_scale:
        [(shot_fps, shot_scale)]
    ).coalesce(payload_merge_op=payload_plus).map(
        lambda intrvl: (intrvl.start, intrvl.end, {
            'fps': intrvl.payload[0][0],
            'shot_scale': get_mode([p[1] for p in intrvl.payload])
        }))

    shots_with_people_in_them = shots_with_scale.overlaps(
        frames_with_identity,
        payload_merge_op=lambda shot_payload, identities:
        (shot_payload, identities),
        working_window=1
    ).coalesce(
        payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1])
    ).map(lambda intrvl: (
        intrvl.start / intrvl.payload[0]['fps'],
        intrvl.end / intrvl.payload[0]['fps'], {
            'fps': intrvl.payload[0]['fps'],
            'shot_scale': intrvl.payload[0]['shot_scale'],
            'characters': set([
                name.strip().split(' ')[0].strip()
                for d in intrvl.payload[1]
                for name in d['character'].split('/')
                if len(name.strip()) > 0
            ])
        }))

    reaction_shots = captions_with_speaker_id.overlaps(
        shots_with_people_in_them.filter(
            payload_satisfies(lambda p: p['shot_scale'] in [
                'medium_close_up', 'close_up', 'extreme_close_up'
            ])),
        predicate=lambda captions, shots: captions.payload[1].strip().split(
            ' ')[0] not in shots.payload['characters'],
        payload_merge_op=lambda word_and_speaker, fps_and_characters:
        (fps_and_characters['fps'], word_and_speaker)
    ).map(lambda intrvl: (
        int(intrvl.start * intrvl.payload[0]),
        int(intrvl.end * intrvl.payload[0]),
        [intrvl.payload[1]]
    )).dilate(12).coalesce(
        payload_merge_op=payload_plus).dilate(-12).filter_length(
            min_length=12)

    return intrvllists_to_result_with_objects(reaction_shots,
                                              lambda a, b: [])
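
# Hedged sketch (assumed 24 fps; the collection argument is hypothetical):
# reaction_shots_apollo_13 maps captions from seconds to frames by
# multiplying by each video's fps, then dilates by 12 frames (half a second
# at 24 fps) before coalescing. The conversion step in isolation:
def _example_seconds_to_frames(caption_intervals, fps=24.0):
    return caption_intervals.map(lambda intrvl: (
        int(intrvl.start * fps), int(intrvl.end * fps), intrvl.payload))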
def get_shots_for_video(vid):
    # Assumes Shot, Frame, Labeler, PoseMeta, F, VideoIntervalCollection,
    # payload_plus, payload_second, overlaps, ShotScaleEnum, scale_for_shot,
    # poses_for_shot, and ShotFeatures are imported at module level.
    shots_qs = Shot.objects.filter(
        video__id=vid,
        labeler=Labeler.objects.get(name='shot-hsvhist-face')).all()
    total = shots_qs.count()
    print("Total shots:", total)
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs, with_payload=lambda row: [], progress=False, total=total)

    # Take all sampled frames: every 12th.
    frames_qs = Frame.objects.filter(video__id=vid).annotate(
        numbermod=F('number') % 12).filter(numbermod=0).annotate(
            scale=F("shot_scale__name"))
    total = frames_qs.count()
    print("Total frames with scale:", total)
    shot_scales = VideoIntervalCollection.from_django_qs(
        frames_qs,
        schema={
            "start": "number",
            "end": "number",
        },
        with_payload=lambda f: [ShotScaleEnum[f.scale.upper()]],
        progress=False,
        total=total)

    # Take all poses
    poses_qs = PoseMeta.objects.filter(frame__video__id=vid).annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))
    total = poses_qs.count()
    print("Total Poses:", total)
    poses = VideoIntervalCollection.from_django_qs(
        poses_qs, with_payload=lambda row: [row], progress=False,
        total=total).coalesce(payload_merge_op=payload_plus)

    print("Merging scales into shots")
    # Merge scales into shots
    shots_with_scale = shots.merge(
        shot_scales,
        payload_merge_op=payload_second,
        predicate=overlaps(),
        working_window=1
    ).coalesce(
        payload_merge_op=payload_plus
    ).map(lambda shot_interval: (
        shot_interval.get_start(), shot_interval.get_end(),
        {"scale": scale_for_shot(shot_interval.get_payload())}))

    print("Merging poses into shots")
    # Merge poses into shots
    shots_with_poses = shots.merge(
        poses.map(lambda shot_interval: (
            shot_interval.get_start(), shot_interval.get_end(),
            [shot_interval.get_payload()])),
        payload_merge_op=payload_second,
        predicate=overlaps(),
        working_window=1
    ).coalesce(
        # Get a list of lists of poses for each shot
        payload_merge_op=payload_plus
    ).map(lambda shot_interval: (
        shot_interval.get_start(), shot_interval.get_end(),
        {"poses": poses_for_shot(shot_interval.get_payload())}))

    print("Computing shot features")
    # Get shots with shot features
    shots = shots_with_scale.merge(
        shots_with_poses,
        payload_merge_op=lambda d1, d2: {**d1, **d2},
        predicate=overlaps(),
        working_window=1
    ).coalesce().map(lambda intv: (
        intv.get_start(), intv.get_end(),
        ShotFeatures(intv.get_payload()["scale"],
                     intv.get_payload()["poses"])))

    return shots
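
# Hedged usage sketch: 15 is a hypothetical video id. Each resulting
# interval carries a ShotFeatures payload built from the shot's dominant
# scale and its per-frame poses; the payload is printed as-is since
# ShotFeatures' attribute names are not shown above.
def _example_shot_features(vid=15):
    shots = get_shots_for_video(vid)
    for video_id, intervallist in shots.get_allintervals().items():
        for intvl in intervallist.get_intervals():
            print(video_id, intvl.get_start(), intvl.get_end(),
                  intvl.get_payload())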
def detect_commercial_rekall(video, transcript_path, blackframe_list=None,
                             histogram=None, debug=True, verbose=False):
    """
    API for detecting commercial blocks from TV news video using rekall

    @video: django query set
    @transcript_path: transcript_path
    @blackframe_list: list of black frame indexes
    @histogram: list of 16x3-bin histograms for each frame, not used if
        blackframe_list is provided

    Return:
        commercial_list (list of tuple((start_fid, start_sec),
        (end_fid, end_sec)), None if failed)
    """
    if type(video) == dict:
        fps = video['fps']
        video_length = video['num_frames'] / fps
    else:
        fps = video.fps
        video_length = video.num_frames / video.fps

    transcript = load_transcript(transcript_path)

    if blackframe_list is None:
        blackframe_intervallist = get_blackframe_list(histogram)
    else:
        blackframe_intervallist = IntervalList([
            (fid2second(fid, fps), fid2second(fid + 1, fps), 0)
            for fid in blackframe_list
        ])

    # get black windows
    black_windows = blackframe_intervallist \
        .dilate(1. / fps) \
        .coalesce() \
        .dilate(-1. / fps)
    # .filter_length(min_length=MIN_BLACKWINDOW * 1. / video.fps)
    if verbose:
        print("black window: ({})\n".format(black_windows.size()))
        for idx, win in enumerate(black_windows.get_intervals()):
            print(idx, win)

    # get all instances of >>
    arrow_intervals = get_text_intervals(">>", transcript)
    arrow_announcer_intervals = get_text_intervals(">> Announcer:",
                                                   transcript)
    arrow_having_intervals = get_text_intervals(">> HAVING", transcript)
    if verbose:
        print("arrow_text: ({})\n".format(arrow_intervals.size()),
              arrow_intervals)
        print("arrow_announcer_text: ({})\n".format(
            arrow_announcer_intervals.size()), arrow_announcer_intervals)

    # get intervals for the whole transcript
    transcript_intervals = IntervalList([
        (start_sec, end_sec, 0)
        for text, start_sec, end_sec in transcript
        if '{' not in text
    ]).dilate(1) \
        .coalesce() \
        .dilate(-1)

    # get an interval for the whole video
    whole_video = IntervalList([(0., video_length, 0)])

    # whole video minus black windows to get segments in between black
    # windows; then filter out anything that overlaps with ">>" as long as
    # it's not ">> Announcer:"; then coalesce, as long as it doesn't get
    # too long
    def fold_fn(stack, interval):
        if interval.length() > MAX_COMMERCIAL_TIME:
            interval = Interval(interval.start,
                                interval.start + MAX_COMMERCIAL_TIME,
                                interval.payload)
        if len(stack) == 0:
            stack.append(interval)
        else:
            last = stack.pop()
            if or_pred(overlaps(), after(max_dist=5),
                       arity=2)(interval, last):
                if last.merge(interval).length() > MAX_COMMERCIAL_TIME:
                    stack.append(Interval(
                        last.start, last.start + MAX_COMMERCIAL_TIME,
                        last.payload))
                else:
                    stack.append(last.merge(interval))
            else:
                stack.append(last)
                stack.append(interval)
        return stack

    # get reliable double-arrow intervals
    reliable_transcripts = transcript_intervals.filter_length(
        min_length=RELIABLE_TEXT_DURATION)
    arrow_intervals = arrow_intervals \
        .minus(arrow_announcer_intervals) \
        .minus(arrow_having_intervals) \
        .filter_against(
            reliable_transcripts,
            predicate=overlaps())

    # get non-commercial blocks by filtering out intervals that overlap
    # with >>
    all_blocks = whole_video.minus(black_windows)
    non_commercial_blocks = all_blocks.filter_against(
        arrow_intervals, predicate=overlaps())
    commercial_blocks = whole_video.minus(
        non_commercial_blocks.set_union(black_windows))
    if verbose:
        print("commercial blocks candidates: ({})\n".format(
            commercial_blocks.size()))
        for idx, win in enumerate(commercial_blocks.get_intervals()):
            print(idx, win)
    commercials = commercial_blocks \
        .fold_list(fold_fn, []) \
        .filter_length(min_length=MIN_COMMERCIAL_TIME)
    commercials_raw = copy.deepcopy(commercials)
    if verbose:
        print("commercials from blackwindow:\n", commercials)

    # add in lowercase intervals
    lowercase_intervals = get_lowercase_intervals(transcript)
    if verbose:
        print("lowercase intervals:\n", lowercase_intervals)
    commercials = commercials.set_union(lowercase_intervals)
    if verbose:
        print("commercials merge with lowercase:\n", commercials)

    # get blank intervals
    blank_intervals = whole_video \
        .minus(transcript_intervals) \
        .filter_length(min_length=MIN_BLANKWINDOW,
                       max_length=MAX_BLANKWINDOW)
    # remove the last one-minute segment, which has no aligned transcript
    blank_intervals = blank_intervals \
        .minus(IntervalList([(video_length - 60, video_length, 0)])) \
        .filter_length(min_length=MIN_BLANKWINDOW)
    if verbose:
        print("blank intervals:\n", blank_intervals)

    # add in blank intervals
    commercials = commercials.set_union(blank_intervals)
    # commercials = commercials.merge(blank_intervals,
    #     predicate=or_pred(before(max_dist=MAX_MERGE_GAP),
    #                       after(max_dist=MAX_MERGE_GAP), arity=2),
    #     working_window=MAX_MERGE_GAP
    #     ) \
    #     .filter_length(max_length=MAX_MERGE_DURATION) \
    #     .set_union(commercials) \
    #     .dilate(MIN_COMMERCIAL_GAP / 2) \
    #     .coalesce() \
    #     .dilate(-MIN_COMMERCIAL_GAP / 2)
    if verbose:
        print("commercials merge with blank intervals:\n", commercials)

    # merge with small gaps, but only if that doesn't make things too long
    commercials = commercials \
        .dilate(MAX_MERGE_GAP / 2) \
        .coalesce() \
        .dilate(-MAX_MERGE_GAP / 2) \
        .filter_length(max_length=MAX_COMMERCIAL_TIME) \
        .set_union(commercials_raw) \
        .set_union(lowercase_intervals) \
        .set_union(blank_intervals) \
        .coalesce()

    # # post-process commercials to get rid of gaps, small commercials, and
    # # isolated blocks
    # small_gaps = whole_video \
    #     .minus(commercials) \
    #     .filter_length(max_length=MAX_COMMERCIAL_GAP) \
    #     .filter_against(
    #         arrow_text.filter_against(
    #             announcer_text,
    #             predicate=not_pred(overlaps()),
    #             working_window=1.0
    #         ), predicate=not_pred(overlaps()),
    #         working_window=1.0)
    # # merge with small gaps, but only if that doesn't make things too long
    # commercials = commercials \
    #     .set_union(small_gaps.dilate(0.1)) \
    #     .coalesce() \
    #     .filter_length(max_length=MAX_COMMERCIAL_TIME) \
    #     .set_union(commercials) \
    #     .coalesce()
    # # get isolated commercials
    # not_isolated_commercials = commercials.filter_against(commercials,
    #     predicate=or_pred(before(max_dist=MAX_COMMERCIAL_TIME),
    #                       after(max_dist=MAX_COMMERCIAL_TIME), arity=2),
    #     working_window=MAX_COMMERCIAL_TIME)
    # isolated_commercials = commercials.minus(not_isolated_commercials)
    # commercials_to_delete = isolated_commercials \
    #     .filter_length(max_length=MIN_COMMERCIAL_TIME_FINAL) \
    #     .set_union(isolated_commercials \
    #         .filter_against(blank_intervals, predicate=equal()) \
    #         .filter_length(max_length=MAX_ISOLATED_BLANK_TIME))
    # commercials = commercials.minus(commercials_to_delete)

    if debug:
        result = {
            'black': black_windows.dilate(2),
            'arrow': arrow_intervals.dilate(2),
            'commercials_raw': commercials_raw,
            'lowercase': lowercase_intervals,
            'blank': blank_intervals,
            'commercials': commercials,
        }
        return result
    else:
        result = [(i.start, i.end) for i in commercials.get_intervals()]
        return result
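
# Hedged usage sketch: with debug=True this variant returns a dict of named
# IntervalLists rather than (start, end) tuples, which is convenient for
# inspecting each stage. The video metadata dict and transcript path are
# hypothetical.
def _example_debug_run():
    video = {'fps': 29.97, 'num_frames': 108000}  # hypothetical metadata
    stages = detect_commercial_rekall(video, '/path/to/transcript.srt',
                                      blackframe_list=[100, 101],
                                      debug=True, verbose=False)
    for name, intervals in stages.items():
        print(name, intervals.size())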