def test_annotate_video(self):
    # Setup Expected Response
    expected_response = {}
    expected_response = video_intelligence_pb2.AnnotateVideoResponse(
        **expected_response)
    operation = operations_pb2.Operation(
        name='operations/test_annotate_video', done=True)
    operation.response.Pack(expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    client = videointelligence_v1p1beta1.VideoIntelligenceServiceClient(
        channel=channel)

    # Setup Request
    input_uri = 'gs://demomaker/cat.mp4'
    features_element = enums.Feature.LABEL_DETECTION
    features = [features_element]

    response = client.annotate_video(
        input_uri=input_uri, features=features)
    result = response.result()
    assert expected_response == result

    assert len(channel.requests) == 1
    expected_request = video_intelligence_pb2.AnnotateVideoRequest(
        input_uri=input_uri, features=features)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def analyze_all(path):
    """Run speech, explicit-content, label, and shot-change detection on a video."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [
        videointelligence.enums.Feature.SPEECH_TRANSCRIPTION,
        videointelligence.enums.Feature.EXPLICIT_CONTENT_DETECTION,
        videointelligence.enums.Feature.LABEL_DETECTION,
        videointelligence.enums.Feature.SHOT_CHANGE_DETECTION
    ]

    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US',
        enable_automatic_punctuation=True)
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        path, features=features, video_context=video_context)
    print('\nProcessing video for all features')

    result = operation.result(timeout=600)

    # There is only one annotation_result since only
    # one video is processed.
    # Open in text mode: str(ar) produces text, which cannot be written
    # to a binary ('wb') file handle in Python 3.
    with open('C:\\temp\\cloud-intelligence-results-otezla.json', 'w') as out:
        for ar in result.annotation_results:
            out.write(str(ar))
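# A minimal companion sketch: str(ar) above emits the protobuf *text* format,
# not JSON, despite the .json extension on the output path. Assuming the same
# pre-2.0 protobuf-based client as analyze_all(), google.protobuf.json_format
# produces real JSON. The function name and out_path are illustrative.
def write_results_as_json(result, out_path):
    from google.protobuf import json_format
    with open(out_path, 'w') as out:
        for ar in result.annotation_results:
            out.write(json_format.MessageToJson(ar))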
def test_annotate_video(self):
    # Setup Expected Response
    expected_response = {}
    expected_response = video_intelligence_pb2.AnnotateVideoResponse(
        **expected_response)
    operation = operations_pb2.Operation(
        name="operations/test_annotate_video", done=True)
    operation.response.Pack(expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = videointelligence_v1p1beta1.VideoIntelligenceServiceClient()

    # Setup Request
    features_element = enums.Feature.LABEL_DETECTION
    features = [features_element]
    input_uri = "gs://cloud-samples-data/video/cat.mp4"

    response = client.annotate_video(features, input_uri=input_uri)
    result = response.result()
    assert expected_response == result

    assert len(channel.requests) == 1
    expected_request = video_intelligence_pb2.AnnotateVideoRequest(
        features=features, input_uri=input_uri)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def speech_transcription(input_uri):
    """Transcribe speech from a video stored on GCS."""
    video_client = videointelligence.VideoIntelligenceServiceClient()

    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US',
        enable_automatic_punctuation=True)
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        input_uri, features=features, video_context=video_context)

    print('\nProcessing video for speech transcription.')

    result = operation.result(timeout=300)

    # There is only one annotation_result since only
    # one video is processed.
    annotation_results = result.annotation_results[0]
    speech_transcription = annotation_results.speech_transcriptions[0]
    alternative = speech_transcription.alternatives[0]
    return alternative
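# Hypothetical usage of speech_transcription() above: the returned alternative
# carries the transcript, a confidence score, and per-word timings. The GCS
# URI is illustrative.
alternative = speech_transcription('gs://my-bucket/videos/interview.mp4')
print('Transcript: {}'.format(alternative.transcript))
print('Confidence: {}'.format(alternative.confidence))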
def track_objects(path):
    # [START video_object_tracking_beta]
    """Object Tracking."""
    from google.cloud import videointelligence_v1p2beta1 as videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.OBJECT_TRACKING]

    with io.open(path, "rb") as file:
        input_content = file.read()

    # It is recommended to use location_id as 'us-east1' for the best latency
    # due to different types of processors used in this region and others.
    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_content": input_content,
            "location_id": "us-east1",
        })
    print("\nProcessing video for object annotations.")

    result = operation.result(timeout=500)
    print("\nFinished processing.\n")

    # The first result is retrieved because a single video was processed.
    object_annotations = result.annotation_results[0].object_annotations

    # Get only the first annotation for demo purposes.
    object_annotation = object_annotations[0]
    # description is in Unicode
    print(u"Entity description: {}".format(
        object_annotation.entity.description))
    if object_annotation.entity.entity_id:
        print("Entity id: {}".format(object_annotation.entity.entity_id))

    print("Segment: {}s to {}s".format(
        object_annotation.segment.start_time_offset.seconds +
        object_annotation.segment.start_time_offset.microseconds / 1e6,
        object_annotation.segment.end_time_offset.seconds +
        object_annotation.segment.end_time_offset.microseconds / 1e6,
    ))

    print("Confidence: {}".format(object_annotation.confidence))

    # Here we print only the bounding box of the first frame in this segment
    frame = object_annotation.frames[0]
    box = frame.normalized_bounding_box
    print("Time offset of the first frame: {}s".format(
        frame.time_offset.seconds + frame.time_offset.microseconds / 1e6))
    print("Bounding box position:")
    print("\tleft : {}".format(box.left))
    print("\ttop : {}".format(box.top))
    print("\tright : {}".format(box.right))
    print("\tbottom: {}".format(box.bottom))
    print("\n")
    # [END video_object_tracking_beta]
    return object_annotations
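# Hypothetical usage of track_objects() above with a local video file; the
# path is illustrative.
object_annotations = track_objects("resources/cat.mp4")
print("Tracked {} object(s) in the video.".format(len(object_annotations)))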
def track_objects(path):
    # [START video_object_tracking_beta]
    """Object Tracking."""
    from google.cloud import videointelligence_v1p2beta1 as videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.OBJECT_TRACKING]

    with io.open(path, 'rb') as file:
        input_content = file.read()

    # It is recommended to use location_id as 'us-east1' for the best latency
    # due to different types of processors used in this region and others.
    operation = video_client.annotate_video(
        input_content=input_content,
        features=features,
        location_id='us-east1')
    print('\nProcessing video for object annotations.')

    result = operation.result(timeout=300)
    print('\nFinished processing.\n')

    # The first result is retrieved because a single video was processed.
    object_annotations = result.annotation_results[0].object_annotations

    # Get only the first annotation for demo purposes.
    object_annotation = object_annotations[0]
    print('Entity description: {}'.format(
        object_annotation.entity.description))
    if object_annotation.entity.entity_id:
        print('Entity id: {}'.format(object_annotation.entity.entity_id))

    print('Segment: {}s to {}s'.format(
        object_annotation.segment.start_time_offset.seconds +
        object_annotation.segment.start_time_offset.nanos / 1e9,
        object_annotation.segment.end_time_offset.seconds +
        object_annotation.segment.end_time_offset.nanos / 1e9))

    print('Confidence: {}'.format(object_annotation.confidence))

    # Here we print only the bounding box of the first frame in this segment
    frame = object_annotation.frames[0]
    box = frame.normalized_bounding_box
    print('Time offset of the first frame: {}s'.format(
        frame.time_offset.seconds + frame.time_offset.nanos / 1e9))
    print('Bounding box position:')
    print('\tleft : {}'.format(box.left))
    print('\ttop : {}'.format(box.top))
    print('\tright : {}'.format(box.right))
    print('\tbottom: {}'.format(box.bottom))
    print('\n')
    # [END video_object_tracking_beta]
    return object_annotations
def video_detect_text(path):
    # [START video_detect_text_beta]
    """Detect text in a local video."""
    from google.cloud import videointelligence_v1p2beta1 as videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.TEXT_DETECTION]
    video_context = videointelligence.VideoContext()

    with io.open(path, "rb") as file:
        input_content = file.read()

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_content": input_content,
            "video_context": video_context,
        })

    print("\nProcessing video for text detection.")
    result = operation.result(timeout=300)

    # The first result is retrieved because a single video was processed.
    annotation_result = result.annotation_results[0]

    # Get only the first result
    text_annotation = annotation_result.text_annotations[0]
    print("\nText: {}".format(text_annotation.text))

    # Get the first text segment
    text_segment = text_annotation.segments[0]
    start_time = text_segment.segment.start_time_offset
    end_time = text_segment.segment.end_time_offset
    print("start_time: {}, end_time: {}".format(
        start_time.seconds + start_time.microseconds * 1e-6,
        end_time.seconds + end_time.microseconds * 1e-6,
    ))

    print("Confidence: {}".format(text_segment.confidence))

    # Show the result for the first frame in this segment.
    frame = text_segment.frames[0]
    time_offset = frame.time_offset
    print("Time offset for the first frame: {}".format(
        time_offset.seconds + time_offset.microseconds * 1e-6))
    print("Rotated Bounding Box Vertices:")
    for vertex in frame.rotated_bounding_box.vertices:
        print("\tVertex.x: {}, Vertex.y: {}".format(vertex.x, vertex.y))
    # [END video_detect_text_beta]
    return annotation_result.text_annotations
def speech_transcription(input_uri, timeout=180):
    # [START video_speech_transcription_gcs_beta]
    """Transcribe speech from a video stored on GCS."""
    from google.cloud import videointelligence_v1p1beta1 as videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]

    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US", enable_automatic_punctuation=True)
    video_context = videointelligence.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_uri": input_uri,
            "video_context": video_context,
        })

    print("\nProcessing video for speech transcription.")

    result = operation.result(timeout)

    # There is only one annotation_result since only
    # one video is processed.
    annotation_results = result.annotation_results[0]
    for speech_transcription in annotation_results.speech_transcriptions:
        # The number of alternatives for each transcription is limited by
        # SpeechTranscriptionConfig.max_alternatives.
        # Each alternative is a different possible transcription
        # and has its own confidence score.
        for alternative in speech_transcription.alternatives:
            print("Alternative level information:")

            print("Transcript: {}".format(alternative.transcript))
            print("Confidence: {}\n".format(alternative.confidence))

            print("Word level information:")
            for word_info in alternative.words:
                word = word_info.word
                start_time = word_info.start_time
                end_time = word_info.end_time
                print("\t{}s - {}s: {}".format(
                    start_time.seconds + start_time.microseconds * 1e-6,
                    end_time.seconds + end_time.microseconds * 1e-6,
                    word,
                ))
def video_detect_text(path):
    # [START video_detect_text_beta]
    """Detect text in a local video."""
    from google.cloud import videointelligence_v1p2beta1 as videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.TEXT_DETECTION]
    video_context = videointelligence.types.VideoContext()

    with io.open(path, 'rb') as file:
        input_content = file.read()

    operation = video_client.annotate_video(
        input_content=input_content,  # the bytes of the video file
        features=features,
        video_context=video_context)

    print('\nProcessing video for text detection.')
    result = operation.result(timeout=300)

    # The first result is retrieved because a single video was processed.
    annotation_result = result.annotation_results[0]

    # Get only the first result
    text_annotation = annotation_result.text_annotations[0]
    print('\nText: {}'.format(text_annotation.text))

    # Get the first text segment
    text_segment = text_annotation.segments[0]
    start_time = text_segment.segment.start_time_offset
    end_time = text_segment.segment.end_time_offset
    print('start_time: {}, end_time: {}'.format(
        start_time.seconds + start_time.nanos * 1e-9,
        end_time.seconds + end_time.nanos * 1e-9))

    print('Confidence: {}'.format(text_segment.confidence))

    # Show the result for the first frame in this segment.
    frame = text_segment.frames[0]
    time_offset = frame.time_offset
    print('Time offset for the first frame: {}'.format(
        time_offset.seconds + time_offset.nanos * 1e-9))
    print('Rotated Bounding Box Vertices:')
    for vertex in frame.rotated_bounding_box.vertices:
        print('\tVertex.x: {}, Vertex.y: {}'.format(vertex.x, vertex.y))
    # [END video_detect_text_beta]
    return annotation_result.text_annotations
def analyze_shots(path):
    # [START video_analyze_shots]
    """Detects camera shot changes."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.SHOT_CHANGE_DETECTION]
    operation = video_client.annotate_video(path, features=features)
    print('\nProcessing video for shot change annotations:')

    result = operation.result(timeout=90)
    print('\nFinished processing.')

    # first result is retrieved because a single video was processed
    for i, shot in enumerate(result.annotation_results[0].shot_annotations):
        start_time = (shot.start_time_offset.seconds +
                      shot.start_time_offset.nanos / 1e9)
        end_time = (shot.end_time_offset.seconds +
                    shot.end_time_offset.nanos / 1e9)
        print('\tShot {}: {} to {}'.format(i, start_time, end_time))
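# In 2.x releases of google-cloud-videointelligence (the proto-plus clients,
# as in the request={...} variants above that use .microseconds), Duration
# offsets surface as datetime.timedelta, so the seconds + nanos / 1e9
# arithmetic collapses to total_seconds(). A sketch under that assumption:
def print_shots_v2(result):
    for i, shot in enumerate(result.annotation_results[0].shot_annotations):
        print('\tShot {}: {} to {}'.format(
            i,
            shot.start_time_offset.total_seconds(),
            shot.end_time_offset.total_seconds()))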
def process_videoaudio_in_gcs(gcs_filepath):
    print('[ INFO ] Transcribing video audio from {}'.format(gcs_filepath))
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]
    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US',
        # Optional config fields (Python uses snake_case, not the REST names):
        # max_alternatives=1,
        # filter_profanity=False,
        # speech_contexts=...,
        # audio_tracks=[0],
        enable_automatic_punctuation=True)
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)
    operation = video_client.annotate_video(
        gcs_filepath, features=features, video_context=video_context)
    result = operation.result(timeout=180)

    # There is only one annotation_result since only one video is processed.
    annotation_results = result.annotation_results[0]
    speech_transcription = annotation_results.speech_transcriptions[0]
    alternatives = speech_transcription.alternatives

    text_blob = ''
    for alternative in alternatives:
        print('Transcript: {}'.format(alternative.transcript))
        print('Confidence: {}\n'.format(alternative.confidence))
        print('Word level information:')
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            print('\t{}s - {}s: {}'.format(
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9,
                word))
        text_blob = text_blob + ' ' + alternative.transcript

    return annotation_results, text_blob
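# Hypothetical invocation of process_videoaudio_in_gcs() above: it returns
# the raw annotation results plus a concatenated transcript string. The URI
# is illustrative.
annotation_results, text_blob = process_videoaudio_in_gcs(
    'gs://my-bucket/videos/board-meeting.mp4')
print(text_blob.strip())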
def video_detect_text_gcs(input_uri):
    # [START video_detect_text_gcs_beta]
    """Detect text in a video stored on GCS."""
    from google.cloud import videointelligence_v1p2beta1 as videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.TEXT_DETECTION]

    operation = video_client.annotate_video(
        input_uri=input_uri, features=features)

    print("\nProcessing video for text detection.")
    result = operation.result(timeout=300)

    # The first result is retrieved because a single video was processed.
    annotation_result = result.annotation_results[0]

    # Get only the first result
    text_annotation = annotation_result.text_annotations[0]
    print("\nText: {}".format(text_annotation.text))

    # Get the first text segment
    text_segment = text_annotation.segments[0]
    start_time = text_segment.segment.start_time_offset
    end_time = text_segment.segment.end_time_offset
    print("start_time: {}, end_time: {}".format(
        start_time.seconds + start_time.nanos * 1e-9,
        end_time.seconds + end_time.nanos * 1e-9,
    ))

    print("Confidence: {}".format(text_segment.confidence))

    # Show the result for the first frame in this segment.
    frame = text_segment.frames[0]
    time_offset = frame.time_offset
    print("Time offset for the first frame: {}".format(
        time_offset.seconds + time_offset.nanos * 1e-9))
    print("Rotated Bounding Box Vertices:")
    for vertex in frame.rotated_bounding_box.vertices:
        print("\tVertex.x: {}, Vertex.y: {}".format(vertex.x, vertex.y))
    # [END video_detect_text_gcs_beta]
    return annotation_result.text_annotations
def analyze_explicit_content(path):
    # [START video_analyze_explicit_content]
    """Detects explicit content from the GCS path to a video."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.EXPLICIT_CONTENT_DETECTION]

    operation = video_client.annotate_video(path, features=features)
    print('\nProcessing video for explicit content annotations:')

    result = operation.result(timeout=90)
    print('\nFinished processing.')

    # first result is retrieved because a single video was processed
    for frame in result.annotation_results[0].explicit_annotation.frames:
        likelihood = videointelligence.enums.Likelihood(
            frame.pornography_likelihood)
        frame_time = frame.time_offset.seconds + frame.time_offset.nanos / 1e9
        print('Time: {}s'.format(frame_time))
        print('\tpornography: {}'.format(likelihood.name))
def test_annotate_video_exception(self):
    # Setup Response
    error = status_pb2.Status()
    operation = operations_pb2.Operation(
        name='operations/test_annotate_video_exception', done=True)
    operation.error.CopyFrom(error)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    client = videointelligence_v1p1beta1.VideoIntelligenceServiceClient(
        channel=channel)

    # Setup Request
    input_uri = 'gs://demomaker/cat.mp4'
    features_element = enums.Feature.LABEL_DETECTION
    features = [features_element]

    response = client.annotate_video(
        input_uri=input_uri, features=features)
    exception = response.exception()
    assert exception.errors[0] == error
def speech_transcription(input_uri):
    # [START video_speech_transcription_gcs_beta]
    """Transcribe speech from a video stored on GCS."""
    from google.cloud import videointelligence_v1p1beta1 as videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()

    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US',
        enable_automatic_punctuation=True)
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        input_uri, features=features, video_context=video_context)

    print('\nProcessing video for speech transcription.')

    result = operation.result(timeout=180)

    # There is only one annotation_result since only
    # one video is processed.
    annotation_results = result.annotation_results[0]
    speech_transcription = annotation_results.speech_transcriptions[0]
    alternative = speech_transcription.alternatives[0]

    print('Transcript: {}'.format(alternative.transcript))
    print('Confidence: {}\n'.format(alternative.confidence))

    print('Word level information:')
    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        print('\t{}s - {}s: {}'.format(
            start_time.seconds + start_time.nanos * 1e-9,
            end_time.seconds + end_time.nanos * 1e-9,
            word))
def test_annotate_video_exception(self):
    # Setup Response
    error = status_pb2.Status()
    operation = operations_pb2.Operation(
        name="operations/test_annotate_video_exception", done=True)
    operation.error.CopyFrom(error)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = videointelligence_v1p1beta1.VideoIntelligenceServiceClient()

    # Setup Request
    features_element = enums.Feature.LABEL_DETECTION
    features = [features_element]
    input_uri = "gs://cloud-samples-data/video/cat.mp4"

    response = client.annotate_video(features, input_uri=input_uri)
    exception = response.exception()
    assert exception.errors[0] == error
def analyze_labels_file(path):
    # [START video_analyze_labels]
    """Detect labels given a file path."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.LABEL_DETECTION]

    with io.open(path, 'rb') as movie:
        input_content = movie.read()

    operation = video_client.annotate_video(
        features=features, input_content=input_content)
    print('\nProcessing video for label annotations:')

    result = operation.result(timeout=90)
    print('\nFinished processing.')

    # Process video/segment level label annotations
    segment_labels = result.annotation_results[0].segment_label_annotations
    for i, segment_label in enumerate(segment_labels):
        print('Video label description: {}'.format(
            segment_label.entity.description))
        for category_entity in segment_label.category_entities:
            print('\tLabel category description: {}'.format(
                category_entity.description))

        for i, segment in enumerate(segment_label.segments):
            start_time = (segment.segment.start_time_offset.seconds +
                          segment.segment.start_time_offset.nanos / 1e9)
            end_time = (segment.segment.end_time_offset.seconds +
                        segment.segment.end_time_offset.nanos / 1e9)
            positions = '{}s to {}s'.format(start_time, end_time)
            confidence = segment.confidence
            print('\tSegment {}: {}'.format(i, positions))
            print('\tConfidence: {}'.format(confidence))
        print('\n')

    # Process shot level label annotations
    shot_labels = result.annotation_results[0].shot_label_annotations
    for i, shot_label in enumerate(shot_labels):
        print('Shot label description: {}'.format(
            shot_label.entity.description))
        for category_entity in shot_label.category_entities:
            print('\tLabel category description: {}'.format(
                category_entity.description))

        for i, shot in enumerate(shot_label.segments):
            start_time = (shot.segment.start_time_offset.seconds +
                          shot.segment.start_time_offset.nanos / 1e9)
            end_time = (shot.segment.end_time_offset.seconds +
                        shot.segment.end_time_offset.nanos / 1e9)
            positions = '{}s to {}s'.format(start_time, end_time)
            confidence = shot.confidence
            print('\tSegment {}: {}'.format(i, positions))
            print('\tConfidence: {}'.format(confidence))
        print('\n')

    # Process frame level label annotations
    frame_labels = result.annotation_results[0].frame_label_annotations
    for i, frame_label in enumerate(frame_labels):
        print('Frame label description: {}'.format(
            frame_label.entity.description))
        for category_entity in frame_label.category_entities:
            print('\tLabel category description: {}'.format(
                category_entity.description))

        # Each frame_label_annotation has many frames,
        # here we print information only about the first frame.
        frame = frame_label.frames[0]
        time_offset = frame.time_offset.seconds + frame.time_offset.nanos / 1e9
        print('\tFirst frame time offset: {}s'.format(time_offset))
        print('\tFirst frame confidence: {}'.format(frame.confidence))
        print('\n')
def analyze_labels(path):
    # [START video_analyze_labels_gcs]
    """Detects labels given a GCS path."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.LABEL_DETECTION]

    mode = videointelligence.enums.LabelDetectionMode.SHOT_AND_FRAME_MODE
    config = videointelligence.types.LabelDetectionConfig(
        label_detection_mode=mode)
    context = videointelligence.types.VideoContext(
        label_detection_config=config)

    operation = video_client.annotate_video(
        path, features=features, video_context=context)
    print('\nProcessing video for label annotations:')

    result = operation.result(timeout=90)
    print('\nFinished processing.')

    # Process video/segment level label annotations
    segment_labels = result.annotation_results[0].segment_label_annotations
    for i, segment_label in enumerate(segment_labels):
        print('Video label description: {}'.format(
            segment_label.entity.description))
        for category_entity in segment_label.category_entities:
            print('\tLabel category description: {}'.format(
                category_entity.description))

        for i, segment in enumerate(segment_label.segments):
            start_time = (segment.segment.start_time_offset.seconds +
                          segment.segment.start_time_offset.nanos / 1e9)
            end_time = (segment.segment.end_time_offset.seconds +
                        segment.segment.end_time_offset.nanos / 1e9)
            positions = '{}s to {}s'.format(start_time, end_time)
            confidence = segment.confidence
            print('\tSegment {}: {}'.format(i, positions))
            print('\tConfidence: {}'.format(confidence))
        print('\n')

    # Process shot level label annotations
    shot_labels = result.annotation_results[0].shot_label_annotations
    for i, shot_label in enumerate(shot_labels):
        print('Shot label description: {}'.format(
            shot_label.entity.description))
        for category_entity in shot_label.category_entities:
            print('\tLabel category description: {}'.format(
                category_entity.description))

        for i, shot in enumerate(shot_label.segments):
            start_time = (shot.segment.start_time_offset.seconds +
                          shot.segment.start_time_offset.nanos / 1e9)
            end_time = (shot.segment.end_time_offset.seconds +
                        shot.segment.end_time_offset.nanos / 1e9)
            positions = '{}s to {}s'.format(start_time, end_time)
            confidence = shot.confidence
            print('\tSegment {}: {}'.format(i, positions))
            print('\tConfidence: {}'.format(confidence))
        print('\n')

    # Process frame level label annotations
    frame_labels = result.annotation_results[0].frame_label_annotations
    for i, frame_label in enumerate(frame_labels):
        print('Frame label description: {}'.format(
            frame_label.entity.description))
        for category_entity in frame_label.category_entities:
            print('\tLabel category description: {}'.format(
                category_entity.description))

        # Each frame_label_annotation has many frames,
        # here we print information only about the first frame.
        frame = frame_label.frames[0]
        time_offset = (frame.time_offset.seconds +
                       frame.time_offset.nanos / 1e9)
        print('\tFirst frame time offset: {}s'.format(time_offset))
        print('\tFirst frame confidence: {}'.format(frame.confidence))
        print('\n')
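# The seconds + nanos / 1e9 conversion recurs throughout the label loops
# above; a small helper (hypothetical, for the pre-2.0 protobuf Duration
# offsets used here) keeps those loops readable:
def to_seconds(offset):
    """Convert a protobuf Duration-style offset to float seconds."""
    return offset.seconds + offset.nanos / 1e9

# e.g. positions = '{}s to {}s'.format(
#     to_seconds(segment.segment.start_time_offset),
#     to_seconds(segment.segment.end_time_offset))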