Example #1
File: tvnews.py  Project: DanFu09/esper
def interview_with_person_x():
    from django.db.models import F
    from query.models import LabeledCommercial, FaceIdentity
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.temporal_predicates import before, after, overlaps
    from rekall.logical_predicates import or_pred
    from esper.rekall import intrvllists_to_result

    # Get list of sandbox video IDs
    sandbox_videos = [
        row.video_id
        for row in LabeledCommercial.objects.distinct('video_id')
    ]

    guest_name = "bernie sanders"

    # Load hosts and instances of guest from SQL
    identities = FaceIdentity.objects.filter(face__shot__video_id__in=sandbox_videos)
    hosts_qs = identities.filter(face__is_host=True)
    guest_qs = identities.filter(identity__name=guest_name).filter(probability__gt=0.7)

    # Load host and guest intervals from SQL into rekall collections,
    #   annotated with the bounds of the containing shot
    hosts = VideoIntervalCollection.from_django_qs(
        hosts_qs.annotate(video_id=F("face__shot__video_id"),
                          min_frame=F("face__shot__min_frame"),
                          max_frame=F("face__shot__max_frame"))
    )
    guest = VideoIntervalCollection.from_django_qs(
        guest_qs.annotate(video_id=F("face__shot__video_id"),
                          min_frame=F("face__shot__min_frame"),
                          max_frame=F("face__shot__max_frame"))
    )

    # Get all shots where the guest and a host are on screen together
    guest_with_host = guest.overlaps(hosts).coalesce()

    # This temporal predicate matches when A overlaps B, A is before B by
    #   less than 10 frames, or A is after B by less than 10 frames
    overlaps_before_or_after_pred = or_pred(
        or_pred(overlaps(), before(max_dist=10), arity=2),
        after(max_dist=10), arity=2)

    # Find sequences where a guest-with-host shot overlaps, immediately
    #   precedes, or immediately follows either a host shot or a guest shot
    interview_candidates = guest_with_host \
            .merge(hosts, predicate=overlaps_before_or_after_pred) \
            .set_union(guest_with_host.merge(
                guest, predicate=overlaps_before_or_after_pred)) \
            .coalesce()

    # Sequences may be interrupted by shots where neither the guest nor the
    #   host appears, so dilate and coalesce to merge neighboring segments
    interviews = interview_candidates \
            .dilate(600) \
            .coalesce() \
            .dilate(-600) \
            .filter_length(min_length=1350)

    # Return intervals
    return intrvllists_to_result(interviews.get_allintervals())
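The dilate/coalesce/dilate pattern above is the core smoothing idiom in these examples. Below is a minimal standalone sketch (not from the original project; the numbers are illustrative), assuming the rekall.interval_list API used throughout:

from rekall.interval_list import IntervalList

segments = IntervalList([(0, 800, 0), (1100, 2000, 0), (5000, 5100, 0)])

# Dilating by 300 frames closes the 300-frame gap between the first two
# segments, coalesce merges the now-overlapping intervals, and dilating by
# -300 restores the original outer boundaries.
smoothed = segments.dilate(300).coalesce().dilate(-300)

# Keep only sequences at least 1350 frames long (~45 seconds at 30 fps)
long_segments = smoothed.filter_length(min_length=1350)
for intrvl in long_segments.get_intervals():
    print(intrvl.start, intrvl.end)  # -> 0 2000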
Example #2
def fold_fn(stack, interval):
    # Fold function for IntervalList.fold_list: keeps a stack of merged
    # commercial intervals, capping each at MAX_COMMERCIAL_TIME (a module
    # constant in the original source). Assumes these module-level imports:
    #   from rekall.interval_list import Interval
    #   from rekall.logical_predicates import or_pred
    #   from rekall.temporal_predicates import overlaps, after
    if interval.length() > MAX_COMMERCIAL_TIME:
        interval = Interval(interval.start,
                            interval.start + MAX_COMMERCIAL_TIME,
                            interval.payload)
    if len(stack) == 0:
        stack.append(interval)
    else:
        last = stack.pop()
        # Merge with the previous interval if they overlap or sit within
        # 5 seconds of each other
        if or_pred(overlaps(), after(max_dist=5), arity=2)(interval, last):
            if last.merge(interval).length() > MAX_COMMERCIAL_TIME:
                stack.append(Interval(
                    last.start,
                    last.start + MAX_COMMERCIAL_TIME,
                    last.payload))
            else:
                stack.append(last.merge(interval))
        else:
            stack.append(last)
            stack.append(interval)
    return stack
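A hedged usage sketch for this fold function: IntervalList.fold_list threads the accumulator (here a stack of Intervals) through the intervals in order and wraps the returned stack in a new IntervalList. MAX_COMMERCIAL_TIME is a module constant in the original source; 270 seconds is an assumed stand-in.

from rekall.interval_list import Interval, IntervalList
from rekall.logical_predicates import or_pred
from rekall.temporal_predicates import overlaps, after

MAX_COMMERCIAL_TIME = 270  # assumed value, in seconds

blocks = IntervalList([(0., 100., 0), (102., 300., 0), (400., 420., 0)])
merged = blocks.fold_list(fold_fn, [])
# (0, 100) and (102, 300) are within 5 seconds, so they merge and get capped
# at (0, 270); (400, 420) is too far away and stays separate.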
Example #3
def shot_reverse_shot_complex():
    from query.models import Face, Shot
    from django.db.models import F
    import numpy as np
    from rekall.temporal_predicates import overlaps, before, after
    from rekall.merge_ops import payload_second, payload_plus
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.interval_list import Interval, IntervalList
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import intrvllists_to_result_with_objects

    VIDEO_NAME = 'godfather part iii'

    MAX_FACE_MOVEMENT = 0.15
    MIN_FACE_HEIGHT = 0.2
    MAX_FACES_ON_SCREEN = 4
    RIGHT_HALF_MIN_X = 0.33
    LEFT_HALF_MAX_X = 0.66
    SHOTS_LABELER_ID = 64
    # faces are sampled every 12 frames
    SAMPLING_RATE = 12
    # Annotate face rows with start and end frames and the video ID
    faces = Face.objects.annotate(min_frame=F('frame__number'),
                                  max_frame=F('frame__number'),
                                  video_id=F('frame__video_id')).filter(
                                      frame__video__name__contains=VIDEO_NAME)

    shots = VideoIntervalCollection.from_django_qs(Shot.objects.filter(
        video__name__contains=VIDEO_NAME, labeler_id=SHOTS_LABELER_ID),
                                                   with_payload=lambda obj: [])
    # vids holds, for each frame, the list of detected face bounding boxes
    vids = VideoIntervalCollection.from_django_qs(
        faces.filter(probability__gte=0.99),
        with_payload=in_array(
            bbox_payload_parser(
                VideoIntervalCollection.django_accessor))).coalesce(
                    payload_merge_op=payload_plus)

    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)
    graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)]
        }],
        'edges': []
    }

    faces_on_right = vids.filter(
        and_pred(payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN)),
                 payload_satisfies(scene_graph(graph, region=right_half))))
    faces_on_left = vids.filter(
        and_pred(payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN)),
                 payload_satisfies(scene_graph(graph, region=left_half))))

    def wrap_list(intvl):
        intvl.payload = [intvl.payload]
        return intvl

    def get_height(box):
        return box['y2'] - box['y1']

    def get_center(box):
        return ((box['x1'] + box['x2']) / 2, (box['y1'] + box['y2']) / 2)

    def get_distance(pt1, pt2):
        return np.sqrt((pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2)

    def find_highest_box(boxes):
        if len(boxes) == 0:
            return None
        result = boxes[0]
        best = get_height(result)
        for i in range(1, len(boxes)):
            h = get_height(boxes[i])
            if h > best:
                best = h
                result = boxes[i]
        return result

    def take_highest_in_frame(intvl):
        result = []
        for faces_in_frame in intvl.payload:
            largest = find_highest_box(faces_in_frame)
            if largest is not None:
                result.append(largest)
        intvl.payload = result
        return intvl

    # Check that the displacement of the box center between consecutive
    #   frames is within `dist`
    def inter_frame_movement_less_than(dist):
        def check(boxes):
            for b1, b2 in zip(boxes, boxes[1:]):
                if get_distance(get_center(b1), get_center(b2)) > dist:
                    return False
            return True

        return check

    # Payload is a list, each element is a list of faces for a frame
    shots_with_face_on_right = shots.merge(
        faces_on_right, predicate=overlaps(),
        payload_merge_op=payload_second).map(wrap_list).coalesce(
            payload_merge_op=payload_plus).map(take_highest_in_frame).filter(
                payload_satisfies(
                    inter_frame_movement_less_than(MAX_FACE_MOVEMENT)))
    shots_with_face_on_left = shots.merge(
        faces_on_left, predicate=overlaps(),
        payload_merge_op=payload_second).map(wrap_list).coalesce(
            payload_merge_op=payload_plus).map(take_highest_in_frame).filter(
                payload_satisfies(
                    inter_frame_movement_less_than(MAX_FACE_MOVEMENT)))

    # Right-Left-Right sequences
    shot_reverse_shot_1 = shots_with_face_on_right.merge(
        shots_with_face_on_left,
        predicate=before(max_dist=1)).merge(shots_with_face_on_right,
                                            predicate=before(max_dist=1))

    # Left-Right-Left sequences
    shot_reverse_shot_2 = shots_with_face_on_left.merge(
        shots_with_face_on_right,
        predicate=before(max_dist=1)).merge(shots_with_face_on_left,
                                            predicate=before(max_dist=1))

    shot_reverse_shot = shot_reverse_shot_1.set_union(
        shot_reverse_shot_2).coalesce()
    result = intrvllists_to_result_with_objects(
        shot_reverse_shot.get_allintervals(), payload_to_objs=lambda p, v: [])
    return result
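Here is a standalone sketch of the scene_graph filter used above, assuming the rekall 0.1 API from these examples: payloads are lists of normalized bounding boxes, and the predicate checks whether the graph can be matched against the boxes inside a region.

from rekall.interval_list import Interval
from rekall.spatial_predicates import scene_graph, make_region
from rekall.bbox_predicates import height_at_least
from rekall.payload_predicates import payload_satisfies

left_half = make_region(0.0, 0.0, 0.66, 1.0)
graph = {
    'nodes': [{'name': 'face', 'predicates': [height_at_least(0.2)]}],
    'edges': []
}
pred = payload_satisfies(scene_graph(graph, region=left_half))

# A frame whose payload is a single face box, 0.4 high, in the left half
frame = Interval(0, 1, [{'x1': 0.1, 'y1': 0.2, 'x2': 0.3, 'y2': 0.6}])
print(pred(frame))  # True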
Example #4
def detect_commercial_rekall(video,
                             transcript_path,
                             blackframe_list=None,
                             histogram=None,
                             verbose=True):
    """
    API for detecting commercial blocks from TV news video using rekall
    
    @video: django query set
    @transcript_path: transcript_path
    @blackframe_list: list of black frames index
    @histogram: list of histogram 16x3 bin for each frame, not used if blackframe_list is provided  
    
    Return: commercial_list (list of tuple((start_fid, start_sec), (end_fid, end_sec)), None if failed)
    """

    transcript = load_transcript(transcript_path)
    if blackframe_list is None:
        blackframe_intervallist = get_blackframe_list(histogram)
    else:
        blackframe_intervallist = IntervalList([
            (fid2second(fid, video.fps), fid2second(fid + 1, video.fps), 0)
            for fid in blackframe_list
        ])

    black_windows = blackframe_intervallist \
            .dilate(1. / video.fps) \
            .coalesce() \
            .dilate(-1. / video.fps) \
            .filter_length(min_length=MIN_BLACKWINDOW * 1. / video.fps)
    #     if verbose:
    #         print("black window: ({})\n".format(black_windows.size()))
    #         for idx, win in enumerate(black_windows.get_intervals()):
    #             print(idx, win)

    # get all instances of >>, Announcer:, and >> Announcer: in the transcript
    arrow_text = get_text_intervals(">>", transcript)
    announcer_text = get_text_intervals("Announcer:", transcript)
    arrow_announcer_text = get_text_intervals(">> Announcer:", transcript)
    #     if verbose:
    #         print('arrow_text', arrow_text)
    #         print('announcer_text', announcer_text)
    #         print('arrow_announcer_text', arrow_announcer_text)

    # get an interval for the whole video
    whole_video = IntervalList([(0., video.num_frames / video.fps, 0)])

    # whole video minus black windows to get segments in between black windows
    # then filter out anything that overlaps with ">>" as long as it's not
    #   ">> Announcer:"
    # then coalesce, as long as it doesn't get too long
    def fold_fn(stack, interval):
        if len(stack) == 0:
            stack.append(interval)
        else:
            last = stack.pop()
            if or_pred(overlaps(), after(max_dist=1), arity=2)(interval, last):
                if last.merge(interval).length() > MAX_COMMERCIAL_TIME:
                    if last.length() > MAX_COMMERCIAL_TIME:
                        stack.append(
                            Interval(last.start,
                                     last.start + MAX_COMMERCIAL_TIME,
                                     last.payload))
                    else:
                        stack.append(last)
                    stack.append(interval)
                else:
                    stack.append(last.merge(interval))
            else:
                stack.append(last)
                stack.append(interval)
        return stack

    all_blocks = whole_video.minus(black_windows)
    non_commercial_blocks = all_blocks.filter_against(
        arrow_text.minus(arrow_announcer_text), predicate=overlaps())
    commercial_blocks = whole_video.minus(non_commercial_blocks)
    if verbose:
        print("commercial blocks candidates: ({})\n".format(
            commercial_blocks.size()))
        for idx, win in enumerate(commercial_blocks.get_intervals()):
            print(idx, win)

    commercials = commercial_blocks \
        .fold_list(fold_fn, []) \
        .filter_length(min_length = MIN_COMMERCIAL_TIME)
    #     commercials = whole_video \
    #             .minus(black_windows) \
    #             .filter_against(
    #                 arrow_text.filter_against(arrow_announcer_text,
    #                     predicate=not_pred(overlaps(), arity=2)),
    #                 predicate=not_pred(overlaps(), arity=2)
    #             ) \
    #             .set_union(black_windows) \
    #             .fold_list(fold_fn, []) \
    #             .filter_length(min_length = MIN_COMMERCIAL_TIME)

    if verbose:
        print("commercials from blackwindow:\n", commercials)

    # add in lowercase intervals
    lowercase_intervals = get_lowercase_intervals(transcript)
    if verbose:
        print("lowercase intervals:\n", lowercase_intervals)
    commercials = commercials \
            .set_union(lowercase_intervals) \
            .dilate(MIN_COMMERCIAL_GAP / 2) \
            .coalesce() \
            .dilate(-MIN_COMMERCIAL_GAP / 2)
    if verbose:
        print("commercials merge with lowercase:\n", commercials)

#     if verbose:
#         print(whole_video)
#         print(IntervalList([
#             (start_sec - TRANSCRIPT_DELAY, end_sec - TRANSCRIPT_DELAY, 0)
#             for text, start_sec, end_sec in transcript
#         ]).coalesce().size())

    # get blank intervals
    blank_intervals = whole_video.minus(
        IntervalList([
            (start_sec - TRANSCRIPT_DELAY, end_sec - TRANSCRIPT_DELAY, 0)
            for text, start_sec, end_sec in transcript
        ]).coalesce()).coalesce().filter_length(min_length=MIN_BLANKWINDOW,
                                                max_length=MAX_BLANKWINDOW)

    if verbose:
        print("blank intervals:\n", blank_intervals)

    # add in blank intervals, but only if adding in the new intervals doesn't
    #   get too long
    commercials = commercials.merge(blank_intervals,
            predicate=or_pred(before(max_dist=MAX_MERGE_GAP),
                after(max_dist=MAX_MERGE_GAP), arity=2),
            working_window=MAX_MERGE_GAP
            ) \
            .filter_length(max_length=MAX_MERGE_DURATION) \
            .set_union(commercials) \
            .dilate(MIN_COMMERCIAL_GAP / 2) \
            .coalesce() \
            .dilate(-MIN_COMMERCIAL_GAP / 2)
    if verbose:
        print("commercials merged with blank intervals:\n", commercials)

    # post-process commercials to get rid of gaps, small commercials, and
    #   isolated blocks
    small_gaps = whole_video \
            .minus(commercials) \
            .filter_length(max_length = MAX_COMMERCIAL_GAP) \
            .filter_against(
                    arrow_text.filter_against(
                        announcer_text,
                        predicate=not_pred(overlaps()),
                        working_window=1.0
                    ), predicate=not_pred(overlaps()),
                    working_window=1.0)

    # merge with small gaps, but only if that doesn't make things too long
    commercials = commercials \
            .set_union(small_gaps.dilate(0.1)) \
            .coalesce() \
            .filter_length(max_length=MAX_COMMERCIAL_TIME) \
            .set_union(commercials) \
            .coalesce()

    # get isolated commercials
    not_isolated_commercials = commercials.filter_against(
        commercials,
        predicate=or_pred(before(max_dist=MAX_COMMERCIAL_TIME),
                          after(max_dist=MAX_COMMERCIAL_TIME),
                          arity=2),
        working_window=MAX_COMMERCIAL_TIME)
    isolated_commercials = commercials.minus(not_isolated_commercials)
    commercials_to_delete = isolated_commercials \
            .filter_length(max_length=MIN_COMMERCIAL_TIME_FINAL) \
            .set_union(isolated_commercials \
                .filter_against(blank_intervals, predicate=equal()) \
                .filter_length(max_length=MAX_ISOLATED_BLANK_TIME))

    commercials = commercials.minus(commercials_to_delete)

    return commercials
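The block-candidate logic above boils down to a minus / filter_against idiom. A toy sketch under the same assumed API (times in seconds):

from rekall.interval_list import IntervalList
from rekall.temporal_predicates import overlaps

whole_video = IntervalList([(0., 60., 0)])
black_windows = IntervalList([(10., 11., 0), (30., 31., 0)])
arrow_text = IntervalList([(12., 20., 0)])  # ">>" cues mark news content

# Segments between black windows: (0, 10), (11, 30), (31, 60)
all_blocks = whole_video.minus(black_windows)
# filter_against keeps left intervals that overlap something on the right
non_commercial = all_blocks.filter_against(arrow_text, predicate=overlaps())
# Everything else is a commercial candidate: (0, 11), (30, 60)
commercial_blocks = whole_video.minus(non_commercial)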
Example #5
def kissing():
    # Takes 7min to run!
    from query.models import Face, Shot, Labeler, Video
    from django.db.models import F
    import numpy as np
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.temporal_predicates import overlaps
    from rekall.face_landmark_predicates import looking_left, looking_right
    from rekall.bbox_predicates import height_at_least, same_height
    import esper.face_landmarks_wrapper as flw
    from esper.captions import get_all_segments
    from esper.rekall import intrvllists_to_result_with_objects, bbox_to_result_object
    from esper.stdlib import face_landmarks_to_dict

    MAX_MOUTH_DIFF = 0.12
    MIN_FACE_CONFIDENCE = 0.8
    MIN_FACE_HEIGHT = 0.4
    MAX_FACE_HEIGHT_DIFF = 0.1
    MIN_FACE_OVERLAP_X = 0.05
    MIN_FACE_OVERLAP_Y = 0.2
    MAX_FACE_OVERLAP_X_FRACTION = 0.7
    MIN_FACE_ANGLE = 0.1

    def map_payload(func):
        def map_fn(intvl):
            intvl.payload = func(intvl.payload)
            return intvl

        return map_fn

    def get_landmarks(faces):
        ids = [face['id'] for face in faces]
        landmarks = flw.get(Face.objects.filter(id__in=ids))
        for face, landmark in zip(faces, landmarks):
            face['landmarks'] = landmark
        return faces

    # Annotate face rows with start and end frames and the video ID
    faces_qs = Face.objects.filter(
        probability__gte=MIN_FACE_CONFIDENCE).annotate(
            min_frame=F('frame__number'),
            max_frame=F('frame__number'),
            height=F('bbox_y2') - F('bbox_y1'),
            video_id=F('frame__video_id')).filter(height__gte=MIN_FACE_HEIGHT)

    faces = VideoIntervalCollection.from_django_qs(
        faces_qs,
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'id': 'id'})
            ]))).coalesce(payload_merge_op=payload_plus)

    graph = {
        'nodes': [
            {
                'name': 'face_left',
                'predicates': []
            },
            {
                'name': 'face_right',
                'predicates': []
            },
        ],
        'edges': [
            {
                'start': 'face_left',
                'end': 'face_right',
                'predicates': [
                    # Left face is actually on the left
                    lambda f1, f2: f1['x2'] < f2['x2'] and f1['x1'] < f2['x1'],
                    # Faces overlap horizontally
                    lambda f1, f2: f1['x2'] - f2['x1'] > MIN_FACE_OVERLAP_X,
                    # Faces overlap vertically
                    lambda f1, f2: min(f1['y2'], f2['y2']) - max(
                        f1['y1'], f2['y1']) > MIN_FACE_OVERLAP_Y,
                    # Neither face is entirely above the other
                    lambda f1, f2: f1['y2'] > f2['y1'] and f1['y1'] < f2['y2'],
                    same_height(MAX_FACE_HEIGHT_DIFF),
                    # Horizontal overlap is a bounded fraction of face width
                    lambda f1, f2: (f1['x2'] - f2['x1']) / max(
                        f1['x2'] - f1['x1'],
                        f2['x2'] - f2['x1']) < MAX_FACE_OVERLAP_X_FRACTION
                ]
            },
        ]
    }

    def mouths_are_close(lm1, lm2):
        select_outer = [2, 3, 4, 8, 9, 10]
        select_inner = [1, 2, 3, 5, 6, 7]
        mouth1 = np.concatenate(
            (lm1.outer_lips()[select_outer], lm1.inner_lips()[select_inner]))
        mouth2 = np.concatenate(
            (lm2.outer_lips()[select_outer], lm2.inner_lips()[select_inner]))
        mean1 = np.mean(mouth1, axis=0)
        mean2 = np.mean(mouth2, axis=0)
        return np.linalg.norm(mean1 - mean2) <= MAX_MOUTH_DIFF

    # Face is profile if both eyes are on the same side of the nose bridge horizontally.
    def is_left_profile(f):
        lm = f['landmarks']
        nose_x = min(lm.nose_bridge()[:, 0])
        left = np.all(lm.left_eye()[:, 0] >= nose_x)
        right = np.all(lm.right_eye()[:, 0] >= nose_x)
        return left and right

    def is_right_profile(f):
        lm = f['landmarks']
        nose_x = max(lm.nose_bridge()[:, 0])
        left = np.all(lm.left_eye()[:, 0] <= nose_x)
        right = np.all(lm.right_eye()[:, 0] <= nose_x)
        return left and right

    # Line is ax+by+c=0
    def project_point_to_line(pt, a, b, c):
        x0, y0 = pt[0], pt[1]
        d = a * a + b * b
        x = (b * (b * x0 - a * y0) - a * c) / d
        y = (a * (-b * x0 + a * y0) - b * c) / d
        return np.array([x, y])

    # Positive if facing right
    def signed_face_angle(lm):
        center_line_indices = [27, 28, 32, 33, 34, 51, 62, 66, 57]
        data = lm.landmarks[center_line_indices]
        fit = np.polyfit(data[:, 0], data[:, 1], 1)
        # y = ax+b
        a, b = fit[0], fit[1]
        A = project_point_to_line(lm.landmarks[center_line_indices[0]], a, -1,
                                  b)
        B = project_point_to_line(lm.landmarks[center_line_indices[-1]], a, -1,
                                  b)
        AB = B - A
        AB = AB / np.linalg.norm(AB)
        C = np.mean(lm.nose_bridge()[2:4], axis=0)
        AC = C - A
        AC = AC / np.linalg.norm(AC)
        return np.cross(AB, AC)

    graph2 = {
        'nodes': [
            {
                'name': 'left',
                'predicates': [
                    lambda f: signed_face_angle(f['landmarks']) > MIN_FACE_ANGLE
                    # is_right_profile
                ]
            },
            {
                'name': 'right',
                'predicates': [
                    lambda f: signed_face_angle(f['landmarks']) < -MIN_FACE_ANGLE
                    # is_left_profile
                ]
            },
        ],
        'edges': [{
            'start': 'left',
            'end': 'right',
            'predicates': [
                lambda l, r: mouths_are_close(l['landmarks'], r['landmarks']),
            ]
        }]
    }

    mf_up_close = faces.filter(
        payload_satisfies(scene_graph(graph, exact=True))).map(
            map_payload(get_landmarks)).filter(
                payload_satisfies(scene_graph(graph2, exact=True)))
    vids = mf_up_close.get_allintervals().keys()
    # Merge with shots
    shots_qs = Shot.objects.filter(
        video_id__in=vids,
        labeler=Labeler.objects.get(name='shot-hsvhist-face')).all()
    total = shots_qs.count()
    print("Total shots:", total)
    # use empty list as payload
    shots = VideoIntervalCollection.from_django_qs(shots_qs,
                                                   with_payload=lambda row: [],
                                                   progress=True,
                                                   total=total)
    kissing_shots = mf_up_close.join(shots,
                                     lambda kiss, shot: [(kiss.get_start(
                                     ), shot.get_end(), kiss.get_payload())],
                                     predicate=overlaps(),
                                     working_window=1).coalesce()

    # Getting faces in the shot
    def wrap_in_list(intvl):
        intvl.payload = [intvl.payload]
        return intvl

    print("Getting faces...")
    faces_qs2 = Face.objects.filter(frame__video_id__in=vids,
                                    probability__gte=MIN_FACE_CONFIDENCE)
    total = faces_qs2.count()
    faces2 = VideoIntervalCollection.from_django_qs(
        faces_qs2.annotate(min_frame=F('frame__number'),
                           max_frame=F('frame__number'),
                           video_id=F('frame__video_id')),
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'frame': 'min_frame'})
            ])),
        progress=True,
        total=total).coalesce(payload_merge_op=payload_plus).map(wrap_in_list)

    def clip_to_last_frame_with_two_faces(intvl):
        faces = intvl.get_payload()[1]
        two_faces = [(f[0], f[1]) for f in faces if len(f) == 2]
        two_high_faces = [
            (a, b) for a, b in two_faces
            if min(a['y2'] - a['y1'], b['y2'] - b['y1']) >= MIN_FACE_HEIGHT
        ]
        frame = [a['frame'] for a, b in two_high_faces]

        if len(frame) > 0:
            intvl.end = frame[-1]
        return intvl

    clipped_kissing_shots = kissing_shots.merge(
        faces2,
        payload_merge_op=lambda p1, p2: (p1, p2),
        predicate=overlaps(),
        working_window=1).coalesce(
            payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1])).map(
                clip_to_last_frame_with_two_faces).filter_length(min_length=12)

    results = get_all_segments(vids)
    fps_map = dict((i, Video.objects.get(id=i).fps) for i in vids)
    caption_results = VideoIntervalCollection({
        video_id: [
            (
                word[0] * fps_map[video_id],  # start frame
                word[1] * fps_map[video_id],  # end frame
                word[2])  # payload is the word
            for word in words
        ]
        for video_id, words in results
    })
    kissing_without_words = clipped_kissing_shots.minus(caption_results)
    kissing_final = kissing_without_words.map(lambda intvl: (int(
        intvl.start), int(intvl.end), intvl.payload)).coalesce().filter_length(
            min_length=12)

    def payload_to_objects(p, video_id):
        return [face_landmarks_to_dict(face['landmarks']) for face in p[0]
                ] + [bbox_to_result_object(face, video_id) for face in p[0]]

    return intrvllists_to_result_with_objects(
        kissing_final.get_allintervals(),
        lambda p, vid: payload_to_objects(p, vid),
        stride=1)
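A self-contained sanity check for the line-projection helper defined inside kissing() (the same formula, restated here): projecting the point (2, 0) onto the line x - y = 0 should land on (1, 1).

import numpy as np

# Same math as project_point_to_line above; the line is ax + by + c = 0
def project_point_to_line(pt, a, b, c):
    x0, y0 = pt[0], pt[1]
    d = a * a + b * b
    x = (b * (b * x0 - a * y0) - a * c) / d
    y = (a * (-b * x0 + a * y0) - b * c) / d
    return np.array([x, y])

print(project_point_to_line((2.0, 0.0), 1, -1, 0))  # -> [1. 1.]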
Example #6
def conversations_for_display():
    from query.models import Face, FaceCharacterActor, Shot
    from django.db.models import F
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.interval_list import Interval, IntervalList
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus, merge_named_payload, payload_second
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred, or_pred, true_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after, overlaps, equal
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import (intrvllists_to_result, intrvllists_to_result_bbox,
                              intrvllists_to_result_with_objects,
                              add_intrvllists_to_result)
    from esper.prelude import esper_widget
    import esper.face_embeddings as face_embeddings

    video_id = 15
    EMBEDDING_EQUALITY_THRESHOLD = 1.
    ONE_FRAME = 1

    faces_qs = Face.objects.annotate(min_frame=F('frame__number'),
                                     max_frame=F('frame__number'),
                                     video_id=F('frame__video_id')).filter(
                                         frame__video_id=video_id,
                                         frame__regularly_sampled=True)

    faces_per_frame = VideoIntervalCollection.from_django_qs(
        faces_qs,
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'face_id': 'id'}),
            ]))).coalesce(payload_merge_op=payload_plus)

    shots_qs = Shot.objects.filter(cinematic=True)
    shots = VideoIntervalCollection.from_django_qs(shots_qs)

    shots_with_faces = shots.merge(
        faces_per_frame,
        predicate=overlaps(),
        payload_merge_op=lambda shot_id, faces_in_frame:
        (shot_id, [faces_in_frame])).coalesce(
            payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1]))

    def cluster_center(face_ids):
        #         print("About to compute mean")
        mean_embedding = face_embeddings.mean(face_ids)
        #         print("About to compute dist", face_ids)
        dists = face_embeddings.dist(face_ids, [mean_embedding])
        #         print("Done computing dist")
        return min(zip(dists, face_ids))[1]

    def cluster_and_compute_centers(faces_in_frame_list, shot_id):
        num_people = max(
            len(faces_in_frame) for faces_in_frame in faces_in_frame_list)
        face_ids = [
            face['face_id'] for faces_in_frame in faces_in_frame_list
            for face in faces_in_frame
        ]
        face_heights = [
            face['y2'] - face['y1'] for faces_in_frame in faces_in_frame_list
            for face in faces_in_frame
        ]
        print(num_people)
        if num_people == 1:
            clusters = [(fid, 0) for fid in face_ids]
        else:
            clusters = face_embeddings.kmeans(face_ids, num_people)
#         print("Done clustering")
        centers = [
            (cluster_center(
                [face_id for face_id, cluster_id in clusters if cluster_id == i]),
             [face_id for face_id, cluster_id in clusters if cluster_id == i],
             shot_id,
             max([face_heights[face_ids.index(face_id)]
                  for face_id, cluster_id in clusters if cluster_id == i]))
            for i in range(num_people)
        ]
        #         print("Done computing the center")
        return centers

#     print("About to compute clusters")

    shots_with_centers = shots_with_faces.map(lambda intrvl: (
        intrvl.start, intrvl.end,
        (intrvl.payload[0],
         cluster_and_compute_centers(intrvl.payload[1], intrvl.payload[0]))))

    #     print("Clusters computed")

    def same_face(center1, center2):
        return face_embeddings.dist(
            [center1], target_ids=[center2])[0] < EMBEDDING_EQUALITY_THRESHOLD

    def cross_product_faces(intrvl1, intrvl2):
        payload1 = intrvl1.get_payload()
        payload2 = intrvl2.get_payload()
        payload = []
        for cluster1 in payload1[1]:
            for cluster2 in payload2[1]:
                if not same_face(cluster1[0], cluster2[0]):
                    new_payload = {'A': cluster1, 'B': cluster2}
                    payload.append(new_payload)

        return [(min(intrvl1.get_start(), intrvl2.get_start()),
                 max(intrvl1.get_end(), intrvl2.get_end()), {
                     'chrs': payload,
                     'shots': [payload1[0], payload2[0]]
                 })]

    two_shots = shots_with_centers.join(shots_with_centers,
                                        predicate=after(max_dist=ONE_FRAME,
                                                        min_dist=ONE_FRAME),
                                        merge_op=cross_product_faces)

    #     print("Cross product done")

    def faces_equal(payload1, payload2):
        for face_pair1 in payload1['chrs']:
            for face_pair2 in payload2['chrs']:
                if (same_face(face_pair1['A'][0], face_pair2['A'][0])
                        and same_face(face_pair1['B'][0], face_pair2['B'][0])):
                    return True
                if (same_face(face_pair1['A'][0], face_pair2['B'][0])
                        and same_face(face_pair1['B'][0], face_pair2['A'][0])):
                    return True
        return False

    convs = two_shots.coalesce(
        predicate=payload_satisfies(faces_equal, arity=2),
        payload_merge_op=lambda payload1, payload2: {
            'chrs': payload1['chrs'] + payload2['chrs'],
            'shots': payload1['shots'] + payload2['shots']
        })

    #     print("Coalesce done")

    adjacent_seq = convs.merge(
        convs,
        predicate=and_pred(after(max_dist=ONE_FRAME, min_dist=ONE_FRAME),
                           payload_satisfies(faces_equal, arity=2),
                           arity=2),
        payload_merge_op=lambda payload1, payload2: {
            'chrs': payload1['chrs'] + payload2['chrs'],
            'shots': payload1['shots'] + payload2['shots']
        },
        working_window=1)
    convs = convs.set_union(adjacent_seq)

    # convs = convs.coalesce(predicate=times_equal, payload_merge_op=shots_equal)

    #     print("Two-shot adjacencies done")

    def filter_fn(intvl):
        payload = intvl.get_payload()
        if type(payload) is dict and 'shots' in payload:
            return len(set(payload['shots'])) >= 3
        return False

    convs = convs.filter(filter_fn)
    convs = convs.coalesce()

    #     print("Final filter done")

    #     for video_id in convs.intervals.keys():
    #         print(video_id)
    #         intvllist = convs.get_intervallist(video_id)
    #         for intvl in intvllist.get_intervals():
    #             print(intvl.payload)
    #             print(str(intvl.start) + ':' + str(intvl.end))

    return intrvllists_to_result_with_objects(convs.get_allintervals(),
                                              lambda a, b: [])
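The predicate-guarded coalesce used above is worth isolating. A minimal sketch, assuming coalesce forwards predicate and payload_merge_op to the per-video IntervalLists as in this example: overlapping intervals merge only when the payload predicate holds.

from rekall.interval_list import IntervalList
from rekall.payload_predicates import payload_satisfies

def same_speaker(p1, p2):
    return p1 == p2

ivs = IntervalList([(0, 10, 'a'), (5, 15, 'a'), (12, 30, 'b')])
merged = ivs.coalesce(predicate=payload_satisfies(same_speaker, arity=2),
                      payload_merge_op=lambda p1, p2: p1)
# -> (0, 15, 'a') and (12, 30, 'b'): the 'b' interval overlaps but its
#    payload differs, so it is not absorbed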
Example #7
def reaction_shots_apollo_13():
    from django.db.models import F
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.temporal_predicates import overlaps
    from rekall.parsers import in_array, merge_dict_parsers, bbox_payload_parser, dict_payload_parser
    from esper.caption_metadata import caption_metadata_for_video
    from esper.captions import get_all_segments
    from esper.rekall import intrvllists_to_result_with_objects
    from query.models import FaceCharacterActor, Frame, Shot, Video

    videos = Video.objects.filter(name__contains="apollo 13").all()

    # Load script data
    metadata = VideoIntervalCollection({
        video.id: caption_metadata_for_video(video.id)
        for video in videos
    }).filter(lambda meta_interval: (
        meta_interval.payload['speaker'] is not None
        and "man's voice" not in meta_interval.payload['speaker']
        and meta_interval.payload['speaker'].strip() != "gene krantz"))

    all_segments = get_all_segments([video.id for video in videos])

    captions_interval_collection = VideoIntervalCollection(
        {video: intervals
         for video, intervals in all_segments})

    captions_with_speaker_id = captions_interval_collection.overlaps(
        metadata.filter(payload_satisfies(lambda p: p['aligned'])),
        payload_merge_op=lambda word, script_meta:
        (word[0], script_meta['speaker']))

    # Annotate face rows with start and end frames and the video ID
    faces_with_character_actor_qs = FaceCharacterActor.objects.annotate(
        min_frame=F('face__frame__number'),
        max_frame=F('face__frame__number'),
        video_id=F('face__frame__video_id'),
        character_name=F('characteractor__character__name')).filter(
            video_id__in=[v.id for v in videos])

    frames_with_identity = VideoIntervalCollection.from_django_qs(
        faces_with_character_actor_qs,
        with_payload=in_array(
            dict_payload_parser(VideoIntervalCollection.django_accessor,
                                {'character': 'character_name'}), )).coalesce(
                                    payload_merge_op=payload_plus)

    # Annotate shots with all the people in them
    shots_qs = Shot.objects.filter(
        cinematic=True,
        video_id__in=[v.id for v in videos]).annotate(fps=F('video__fps'))
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs, with_payload=lambda shot: shot.fps)

    # Annotate shots with mode shot scale
    frames_with_shot_scale_qs = Frame.objects.filter(
        regularly_sampled=True,
        video_id__in=[v.id for v in videos
                      ]).annotate(min_frame=F('number'),
                                  max_frame=F('number'),
                                  shot_scale_name=F('shot_scale__name')).all()
    frames_with_shot_scale = VideoIntervalCollection.from_django_qs(
        frames_with_shot_scale_qs, with_payload=lambda f: f.shot_scale_name)

    def get_mode(items):
        return max(set(items), key=items.count)

    shots_with_scale = shots.merge(
        frames_with_shot_scale,
        predicate=overlaps(),
        payload_merge_op=lambda shot_fps, shot_scale: [(shot_fps, shot_scale)]
    ).coalesce(payload_merge_op=payload_plus).map(
        lambda intrvl: (intrvl.start, intrvl.end, {
            'fps': intrvl.payload[0][0],
            'shot_scale': get_mode([p[1] for p in intrvl.payload])
        }))

    shots_with_people_in_them = shots_with_scale.overlaps(
        frames_with_identity,
        payload_merge_op=lambda shot_payload, identities: (shot_payload, identities),
        working_window=1
    ).coalesce(
        payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1])
    ).map(lambda intrvl: (
        intrvl.start / intrvl.payload[0]['fps'],
        intrvl.end / intrvl.payload[0]['fps'],
        {
            'fps': intrvl.payload[0]['fps'],
            'shot_scale': intrvl.payload[0]['shot_scale'],
            'characters': set([
                name.strip().split(' ')[0].strip()
                for d in intrvl.payload[1]
                for name in d['character'].split('/')
                if len(name.strip()) > 0
            ])
        }))

    reaction_shots = captions_with_speaker_id.overlaps(
        shots_with_people_in_them.filter(
            payload_satisfies(lambda p: p['shot_scale'] in
                              ['medium_close_up', 'close_up', 'extreme_close_up'])),
        predicate=lambda captions, shots: captions.payload[1].strip().split(' ')[0]
                  not in shots.payload['characters'],
        payload_merge_op=lambda word_and_speaker, fps_and_characters:
            (fps_and_characters['fps'], word_and_speaker)
    ).map(lambda intrvl: (
        int(intrvl.start * intrvl.payload[0]),
        int(intrvl.end * intrvl.payload[0]),
        [intrvl.payload[1]]
    )).dilate(12).coalesce(payload_merge_op=payload_plus) \
      .dilate(-12).filter_length(min_length=12)

    return intrvllists_to_result_with_objects(reaction_shots.get_allintervals(),
                                              lambda a, b: [])
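A toy sketch of the overlaps operation used repeatedly above (assumed rekall 0.1 API): it intersects the left collection with the right one and combines the two payloads with payload_merge_op.

from rekall.video_interval_collection import VideoIntervalCollection

words = VideoIntervalCollection(
    {1: [(0.0, 1.0, 'houston'), (1.0, 2.0, 'problem')]})
speakers = VideoIntervalCollection({1: [(0.5, 2.5, 'jim lovell')]})

# Each surviving word interval carries (word, speaker) as its payload
tagged = words.overlaps(speakers,
                        payload_merge_op=lambda word, spk: (word, spk))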
Example #8
def get_shots_for_video(vid):
    # Assumed imports for this snippet (they live at module level in the
    # original source):
    #   from django.db.models import F
    #   from query.models import Shot, Frame, Labeler, PoseMeta
    #   from rekall.video_interval_collection import VideoIntervalCollection
    #   from rekall.merge_ops import payload_plus, payload_second
    #   from rekall.temporal_predicates import overlaps
    # ShotScaleEnum, scale_for_shot, poses_for_shot, and ShotFeatures are
    # project-specific helpers defined elsewhere in the module.

    shots_qs = Shot.objects.filter(
        video__id=vid,
        labeler=Labeler.objects.get(name='shot-hsvhist-face')
    ).all()
    total = shots_qs.count()
    print("Total shots:", total)
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs,
        with_payload=lambda row:[],
        progress=False,
        total=total
    )
    
    # Take all sampled frames: every 12th.
    frames_qs = Frame.objects.filter(video__id=vid).annotate(numbermod=F('number') % 12).filter(
            numbermod=0).annotate(scale=F("shot_scale__name"))
    total = frames_qs.count()
    print("Total frames with scale:", total)
    shot_scales = VideoIntervalCollection.from_django_qs(
        frames_qs,
        schema={
            "start": "number",
            "end": "number",
        },
        with_payload=lambda f: [ShotScaleEnum[f.scale.upper()]],
        progress=False, total=total)
    
    # Take all poses
    poses_qs = PoseMeta.objects.filter(frame__video__id=vid).annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id')
    )
    total = poses_qs.count()
    print("Total Poses:", total)
    poses = VideoIntervalCollection.from_django_qs(
        poses_qs,
        with_payload=lambda row: [row],
        progress=False,
        total=total
    ).coalesce(payload_merge_op=payload_plus)
    
    print("Merging scales into shots")
    # Merge scales into shots
    shots_with_scale = shots.merge(
        shot_scales,
        payload_merge_op = payload_second,
        predicate=overlaps(),
        working_window=1
    ).coalesce(
        payload_merge_op=payload_plus
    ).map(
        lambda shot_interval: (shot_interval.get_start(), shot_interval.get_end(),
                              {"scale": scale_for_shot(shot_interval.get_payload())})
    )
    
    print("Merging poses into shots")
    # Merge poses into shots
    shots_with_poses = shots.merge(
        poses.map(lambda shot_interval: (shot_interval.get_start(), shot_interval.get_end(), [shot_interval.get_payload()])),
        payload_merge_op = payload_second,
        predicate=overlaps(),
        working_window=1
    ).coalesce(
        # Get a list of list of poses for each shot
        payload_merge_op = payload_plus
    ).map(lambda shot_interval: (shot_interval.get_start(), shot_interval.get_end(),
                                 {"poses": poses_for_shot(shot_interval.get_payload())}))
    
    print("Computing shot features")
    # Get shots with shot features
    shots = shots_with_scale.merge(
        shots_with_poses,
        payload_merge_op = lambda d1, d2: {**d1,**d2},
        predicate=overlaps(),
        working_window=1
    ).coalesce().map(
        lambda intv: (intv.get_start(), intv.get_end(), ShotFeatures(
            intv.get_payload()["scale"], intv.get_payload()["poses"])))
    return shots
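The shots-plus-frame-labels step above follows a pipeline that recurs in these examples: merge every (shot, label) pair that satisfies overlaps(), keep the label-side payload with payload_second, then coalesce per shot with payload_plus. A small sketch under the same assumed API:

from rekall.interval_list import IntervalList
from rekall.merge_ops import payload_second, payload_plus
from rekall.temporal_predicates import overlaps

shots = IntervalList([(0, 24, [])])
# Frame-level labels as point intervals (frame number as start and end)
frame_labels = IntervalList([(6, 6, ['MEDIUM']), (12, 12, ['CLOSE_UP'])])

per_shot = shots.merge(frame_labels, predicate=overlaps(),
                       payload_merge_op=payload_second) \
                .coalesce(payload_merge_op=payload_plus)
# -> one interval (0, 24) with payload ['MEDIUM', 'CLOSE_UP']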
Example #9
def detect_commercial_rekall(video, transcript_path, blackframe_list=None, histogram=None, debug=True, verbose=False):
    """
    API for detecting commercial blocks from TV news video using rekall
    
    @video: django query set
    @transcript_path: transcript_path
    @blackframe_list: list of black frames index
    @histogram: list of histogram 16x3 bin for each frame, not used if blackframe_list is provided  
    
    Return: commercial_list (list of tuple((start_fid, start_sec), (end_fid, end_sec)), None if failed)
    """
    if isinstance(video, dict):
        fps = video['fps']
        video_length = video['num_frames'] / fps
    else:
        fps = video.fps
        video_length = video.num_frames / video.fps
    
    transcript = load_transcript(transcript_path)
    if blackframe_list is None:
        blackframe_intervallist = get_blackframe_list(histogram)
    else:
        blackframe_intervallist = IntervalList([(fid2second(fid, fps),
                                                fid2second(fid + 1, fps),
                                                0) for fid in blackframe_list])
    
    # get black windows
    black_windows = blackframe_intervallist \
            .dilate(1. / fps) \
            .coalesce() \
            .dilate(-1. / fps)
#             .filter_length(min_length=MIN_BLACKWINDOW * 1. / video.fps)
    if verbose:
        print("black window: ({})\n".format(black_windows.size()))
        for idx, win in enumerate(black_windows.get_intervals()):
            print(idx, win)
    
    # get all instances of >>
    arrow_intervals = get_text_intervals(">>", transcript)
    arrow_announcer_intervals = get_text_intervals(">> Announcer:", transcript)
    arrow_having_intervals = get_text_intervals(">> HAVING", transcript)
    if verbose:
        print("arrow_text: ({})\n".format(arrow_intervals.size()), arrow_intervals)
        print("arrow_announcer_text: ({})\n".format(arrow_announcer_intervals.size()), arrow_announcer_intervals)
    
    # get intervals for the whole transcript
    transcript_intervals = IntervalList([
        (start_sec, end_sec, 0)
        for text, start_sec, end_sec in transcript
        if '{' not in text
    ]).dilate(1) \
      .coalesce() \
      .dilate(-1)
    
    # get an interval for the whole video
    whole_video = IntervalList([(0., video_length, 0)])

    # whole video minus black windows to get segments in between black windows
    # then filter out anything that overlaps with ">>" as long as it's not ">> Announcer:"
    # then coalesce, as long as it doesn't get too long
    def fold_fn(stack, interval):
        if interval.length() > MAX_COMMERCIAL_TIME:
            interval = Interval(interval.start, interval.start + MAX_COMMERCIAL_TIME, interval.payload)
        if len(stack) == 0:
            stack.append(interval)
        else:
            last = stack.pop()
            if or_pred(overlaps(), after(max_dist=5), arity=2)(interval, last):
                if last.merge(interval).length() > MAX_COMMERCIAL_TIME:
                    stack.append(Interval(
                        last.start, 
                        last.start + MAX_COMMERCIAL_TIME, 
                        last.payload))
                else:
                    stack.append(last.merge(interval))
            else:
                stack.append(last)
                stack.append(interval)
        return stack
    
    # get reliable double arrow intervals
    reliable_transcripts = transcript_intervals.filter_length(min_length=RELIABLE_TEXT_DURATION)
    arrow_intervals = arrow_intervals \
        .minus(arrow_announcer_intervals) \
        .minus(arrow_having_intervals) \
        .filter_against(
            reliable_transcripts,
            predicate=overlaps()   
        )
    
    # get non-commercial blocks by filtering out intervals overlaps with >>
    all_blocks = whole_video.minus(black_windows)
    non_commercial_blocks = all_blocks.filter_against(
        arrow_intervals,
        predicate=overlaps()
    )
    
    commercial_blocks = whole_video.minus(non_commercial_blocks.set_union(black_windows))
    if verbose:
        print("commercial blocks candidates: ({})\n".format(commercial_blocks.size()))
        for idx, win in enumerate(commercial_blocks.get_intervals()):
            print(idx, win)
    
    commercials = commercial_blocks \
        .fold_list(fold_fn, []) \
        .filter_length(min_length = MIN_COMMERCIAL_TIME)
    commercials_raw = copy.deepcopy(commercials)
    if verbose:
        print("commercials from blackwindow:\n", commercials)
    
    
    # add in lowercase intervals
    lowercase_intervals = get_lowercase_intervals(transcript)
    if verbose:
        print("lowercase intervals:\n", lowercase_intervals)
    commercials = commercials.set_union(lowercase_intervals) 
    if verbose:
        print("commercials merge with lowercase:\n", commercials)
    
    
    # get blank intervals
    blank_intervals = whole_video \
        .minus(transcript_intervals) \
        .filter_length(min_length=MIN_BLANKWINDOW, max_length=MAX_BLANKWINDOW)
    # remove last one minute segment due to no aligned transcripts
    blank_intervals = blank_intervals \
        .minus(IntervalList([(video_length-60, video_length, 0)])) \
        .filter_length(min_length=MIN_BLANKWINDOW)
    if verbose:
        print("blank intervals:\n", blank_intervals)

    # add in blank intervals
    commercials = commercials.set_union(blank_intervals)

#     commercials = commercials.merge(blank_intervals,
#             predicate=or_pred(before(max_dist=MAX_MERGE_GAP),
#                 after(max_dist=MAX_MERGE_GAP), arity=2),
#             working_window=MAX_MERGE_GAP
#             ) \
#             .filter_length(max_length=MAX_MERGE_DURATION) \
#             .set_union(commercials) \
#             .dilate(MIN_COMMERCIAL_GAP / 2) \
#             .coalesce() \
#             .dilate(-MIN_COMMERCIAL_GAP / 2)
    if verbose:
        print("commercials merge with blank intervals:\n", commercials)
        
        
    # merge with small gaps, but only if that doesn't make things too long
    commercials = commercials \
            .dilate(MAX_MERGE_GAP / 2) \
            .coalesce() \
            .dilate(-MAX_MERGE_GAP / 2) \
            .filter_length(max_length=MAX_COMMERCIAL_TIME) \
            .set_union(commercials_raw) \
            .set_union(lowercase_intervals) \
            .set_union(blank_intervals) \
            .coalesce()
    

#     # post-process commercials to get rid of gaps, small commercials, and
#     #   isolated blocks
#     small_gaps = whole_video \
#             .minus(commercials) \
#             .filter_length(max_length = MAX_COMMERCIAL_GAP) \
#             .filter_against(
#                     arrow_text.filter_against(
#                         announcer_text,
#                         predicate=not_pred(overlaps()),
#                         working_window=1.0
#                     ), predicate=not_pred(overlaps()),
#                     working_window=1.0)
    
#     # merge with small gaps, but only if that doesn't make things too long
#     commercials = commercials \
#             .set_union(small_gaps.dilate(0.1)) \
#             .coalesce() \
#             .filter_length(max_length=MAX_COMMERCIAL_TIME) \
#             .set_union(commercials) \
#             .coalesce()

#     # get isolated commercials
#     not_isolated_commercials = commercials.filter_against(commercials,
#             predicate=or_pred(before(max_dist=MAX_COMMERCIAL_TIME),
#                 after(max_dist=MAX_COMMERCIAL_TIME), arity=2),
#             working_window=MAX_COMMERCIAL_TIME)
#     isolated_commercials = commercials.minus(not_isolated_commercials)
#     commercials_to_delete = isolated_commercials \
#             .filter_length(max_length=MIN_COMMERCIAL_TIME_FINAL) \
#             .set_union(isolated_commercials \
#                 .filter_against(blank_intervals, predicate=equal()) \
#                 .filter_length(max_length=MAX_ISOLATED_BLANK_TIME))
#     commercials = commercials.minus(commercials_to_delete)

    if debug:
        result = {'black': black_windows.dilate(2),
                  'arrow': arrow_intervals.dilate(2),
                  'commercials_raw': commercials_raw,
                  'lowercase': lowercase_intervals,
                  'blank': blank_intervals,
                  'commercials': commercials,
                  }
        return result
    else:
        result = [(i.start, i.end) for i in commercials.get_intervals()]
        return result
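A hypothetical call sketch (the path and numbers below are placeholders, not from the original project):

# With debug=True the function returns the intermediate IntervalLists by
# name; with debug=False it returns plain (start, end) tuples in seconds.
video_meta = {'fps': 29.97, 'num_frames': 108000}  # dict form accepted above
commercials = detect_commercial_rekall(
    video_meta,
    '/path/to/transcript.txt',           # hypothetical transcript path
    blackframe_list=[3600, 3601, 3602],  # hypothetical black frame indices
    debug=False,
    verbose=False)
for start_sec, end_sec in commercials:
    print(start_sec, end_sec)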