def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
        # If operator_object.input["Media"]["Text"]["S3Key"] is a json file,
        # then we're working with metadata about the text file and need to
        # get the actual transcript text from the TextTranscriptUri field.
        # Otherwise we assume operator_object.input["Media"]["Text"]["S3Key"]
        # contains only the transcript text.
        file_ext = str(key.split('.')[-1])
        if file_ext == "json":
            obj = s3.get_object(
                Bucket=bucket,
                Key=key
            )
            results = obj['Body'].read().decode('utf-8')
            results_json = json.loads(results)
            try:
                uri_data = results_json["TextTranscriptUri"]
            except KeyError:
                raise MasExecutionError("JSON can only be passed in from AWS transcribe")
            else:
                bucket = uri_data['S3Bucket']
                key = uri_data['S3Key']
        uri = "s3://" + bucket + '/' + key
        # If input text is empty then we're done.
        response = s3.head_object(Bucket=bucket, Key=key)
        if response['ContentLength'] < 1:
            operator_object.update_workflow_status("Complete")
            operator_object.add_workflow_metadata(comprehend_phrases_job_id="Empty input --> empty output.")
            return operator_object.return_output_object()
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    dataplane = DataPlane()
    output_uri_request = dataplane.generate_media_storage_path(asset_id, workflow_id)
    output_uri = "s3://{bucket}/{key}".format(bucket=output_uri_request["S3Bucket"], key=output_uri_request["S3Key"] + "/comprehend_phrases")
    try:
        comprehend.start_key_phrases_detection_job(
            InputDataConfig={
                'S3Uri': uri,
                'InputFormat': 'ONE_DOC_PER_FILE'
            },
            OutputDataConfig={
                'S3Uri': output_uri
            },
            DataAccessRoleArn=comprehend_role,
            JobName=workflow_id,
            LanguageCode='en'
        )
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="Unable to get response from comprehend: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        comprehend_job_id = workflow_id
        operator_object.add_workflow_metadata(comprehend_phrases_job_id=comprehend_job_id, output_uri=output_uri)
        operator_object.update_workflow_status('Executing')
        return operator_object.return_output_object()
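# The handler above relies on module-level setup that is not shown in this
# listing. A minimal sketch of what it appears to assume (the environment
# variable name for the Comprehend data-access role is a guess, not taken
# from the original source):
import os
import json
import boto3
from MediaInsightsEngineLambdaHelper import MediaInsightsOperationHelper
from MediaInsightsEngineLambdaHelper import MasExecutionError
from MediaInsightsEngineLambdaHelper import DataPlane

s3 = boto3.client('s3')
comprehend = boto3.client('comprehend')
comprehend_role = os.environ['comprehendRole']  # assumed variable name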
Example #2
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    # Instantiate the output helper (as in the other handlers in this listing).
    output_object = MediaInsightsOperationHelper(event)
    try:
        if "Video" in event["Input"]["Media"]:
            s3bucket = event["Input"]["Media"]["ProxyEncode"]["S3Bucket"]
            s3key = event["Input"]["Media"]["ProxyEncode"]["S3Key"]
        elif "Image" in event["Input"]["Media"]:
            s3bucket = event["Input"]["Media"]["Image"]["S3Bucket"]
            s3key = event["Input"]["Media"]["Image"]["S3Key"]
        workflow_id = str(event["WorkflowExecutionId"])
        asset_id = event['AssetId']
    except Exception:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            ContentModerationError="No valid inputs")
        raise MasExecutionError(output_object.return_output_object())
    print("Processing s3://" + s3bucket + "/" + s3key)
    valid_video_types = [".avi", ".mp4", ".mov"]
    valid_image_types = [".png", ".jpg", ".jpeg"]
    file_type = os.path.splitext(s3key)[1].lower()
    if file_type in valid_image_types:
        # Image processing is synchronous.
        response = detect_moderation_labels(s3bucket,
                                            urllib.parse.unquote_plus(s3key))
        output_object.add_workflow_metadata(AssetId=asset_id,
                                            WorkflowExecutionId=workflow_id)
        dataplane = DataPlane()
        metadata_upload = dataplane.store_asset_metadata(
            asset_id, operator_name, workflow_id, response)
        if "Status" not in metadata_upload:
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(
                ContentModerationError=
                "Unable to upload metadata for asset: {asset}".format(
                    asset=asset_id))
            raise MasExecutionError(output_object.return_output_object())
        else:
            if metadata_upload["Status"] == "Success":
                print("Uploaded metadata for asset: {asset}".format(
                    asset=asset_id))
                output_object.update_workflow_status("Complete")
                return output_object.return_output_object()
            elif metadata_upload["Status"] == "Failed":
                output_object.update_workflow_status("Error")
                output_object.add_workflow_metadata(
                    ContentModerationError=
                    "Unable to upload metadata for asset: {asset}".format(
                        asset=asset_id))
                raise MasExecutionError(output_object.return_output_object())
            else:
                output_object.update_workflow_status("Error")
                output_object.add_workflow_metadata(
                    ContentModerationError=
                    "Unable to upload metadata for asset: {asset}".format(
                        asset=asset_id))
                raise MasExecutionError(output_object.return_output_object())
    elif file_type in valid_video_types:
        job_id = start_content_moderation(s3bucket,
                                          urllib.parse.unquote_plus(s3key))
        output_object.update_workflow_status("Executing")
        output_object.add_workflow_metadata(JobId=job_id,
                                            AssetId=asset_id,
                                            WorkflowExecutionId=workflow_id)
        return output_object.return_output_object()
    else:
        print("ERROR: invalid file type")
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            ContentModerationError="Not a valid file type")
        raise MasExecutionError(output_object.return_output_object())
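# Module-level helpers assumed by the handler above but not shown in this
# listing. A sketch of what they presumably wrap, judging from how they are
# called; the operator name value is illustrative, and the real
# start_content_moderation helper likely also passes an SNS topic and role
# for job-completion notification:
import boto3

rek = boto3.client('rekognition')
operator_name = 'contentModeration'  # illustrative value


def detect_moderation_labels(bucket, key):
    # Synchronous image moderation via Rekognition.
    return rek.detect_moderation_labels(
        Image={'S3Object': {'Bucket': bucket, 'Name': key}})


def start_content_moderation(bucket, key):
    # Asynchronous video moderation via Rekognition; returns the job id.
    response = rek.start_content_moderation(
        Video={'S3Object': {'Bucket': bucket, 'Name': key}})
    return response['JobId']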
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    # Instantiate the output helper (as in the other handlers in this listing).
    output_object = MediaInsightsOperationHelper(event)
    try:
        status = event["Status"]
        asset_id = event['MetaData']['AssetId']
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            CelebrityRecognitionError="Missing key {e}".format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Images will have already been processed, so return if job status is already set.
    if status == "Complete":
        output_object.update_workflow_status("Complete")
        return output_object.return_output_object()
    try:
        job_id = event["MetaData"]["JobId"]
        workflow_id = event["MetaData"]["WorkflowExecutionId"]
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            CelebrityRecognitionError="Missing a required metadata key {e}".
            format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Check rekognition job status:
    dataplane = DataPlane()
    pagination_token = ''
    is_paginated = False
    # If pagination token is in event["MetaData"] then use that to start
    # reading reko results from where this Lambda's previous invocation left off.
    if ("PageToken" in event["MetaData"]):
        pagination_token = event["MetaData"]["PageToken"]
        is_paginated = True
    # Read and persist up to 11 reko result pages per invocation of this Lambda
    for page_number in range(11):
        # Get reko results
        print("job id: " + job_id + " page token: " + pagination_token)
        try:
            response = rek.get_celebrity_recognition(
                JobId=job_id, NextToken=pagination_token)
        except rek.exceptions.InvalidPaginationTokenException as e:
            # Trying to reverse seek to the last valid pagination token would be difficult
            # to implement, so in the rare case that a pagination token expires we'll
            # just start over by reading from the first page.
            print(e)
            print(
                "WARNING: Invalid pagination token found. Restarting read from first page."
            )
            pagination_token = ''
            continue
        # If the reko job is IN_PROGRESS then return. We'll check again after a step function wait.
        if response['JobStatus'] == "IN_PROGRESS":
            output_object.update_workflow_status("Executing")
            output_object.add_workflow_metadata(
                JobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return output_object.return_output_object()
        # If the reko job is FAILED then mark the workflow status as Error and return.
        elif response['JobStatus'] == "FAILED":
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(JobId=job_id,
                                                CelebrityRecognitionError=str(
                                                    response["StatusMessage"]))
            raise MasExecutionError(output_object.return_output_object())
        # If the reko job is SUCCEEDED then save this current reko page result
        # and continue to next page_number.
        elif response['JobStatus'] == "SUCCEEDED":
            # If reko results contain more pages then save this page and continue to the next page
            if 'NextToken' in response:
                is_paginated = True
                # Persist rekognition results (current page)
                metadata_upload = dataplane.store_asset_metadata(
                    asset_id=asset_id,
                    operator_name=operator_name,
                    workflow_id=workflow_id,
                    results=response,
                    paginate=True,
                    end=False)
                # If dataplane request succeeded then get the next pagination token and continue.
                if "Status" in metadata_upload and metadata_upload[
                        "Status"] == "Success":
                    # Log that this page has been successfully uploaded to the dataplane
                    print(
                        "Uploaded metadata for asset: {asset}, job {JobId}, page {page}"
                        .format(asset=asset_id,
                                JobId=job_id,
                                page=pagination_token))
                    # Get the next pagination token:
                    pagination_token = response['NextToken']
                    # In order to avoid Lambda timeouts, we're only going to persist 10 pages then
                    # pass the pagination token to the workflow metadata and let our step function
                    # invoker restart this Lambda. The pagination token allows this Lambda
                    # to continue from where it left off.
                    if page_number == 10:
                        output_object.update_workflow_status("Executing")
                        output_object.add_workflow_metadata(
                            PageToken=pagination_token,
                            JobId=job_id,
                            AssetId=asset_id,
                            WorkflowExecutionId=workflow_id)
                        return output_object.return_output_object()
                # If dataplane request failed then mark workflow as failed
                else:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        CelebrityRecognitionError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id),
                        JobId=job_id)
                    raise MasExecutionError(
                        output_object.return_output_object())
            # If reko results contain no more pages then save this page and mark the stage complete
            else:
                # If we've been saving pages, then tell dataplane this is the last page
                if is_paginated:
                    metadata_upload = dataplane.store_asset_metadata(
                        asset_id=asset_id,
                        operator_name=operator_name,
                        workflow_id=workflow_id,
                        results=response,
                        paginate=True,
                        end=True)
                # If there is only one page then save to dataplane without dataplane options
                else:
                    metadata_upload = dataplane.store_asset_metadata(
                        asset_id=asset_id,
                        operator_name=operator_name,
                        workflow_id=workflow_id,
                        results=response)
                # If dataplane request succeeded then mark the stage complete
                if "Status" in metadata_upload and metadata_upload[
                        "Status"] == "Success":
                    print("Uploaded metadata for asset: {asset}".format(
                        asset=asset_id))
                    output_object.add_workflow_metadata(JobId=job_id)
                    output_object.update_workflow_status("Complete")
                    return output_object.return_output_object()
                # If dataplane request failed then mark workflow as failed
                else:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        CelebrityRecognitionError=
                        "Unable to upload metadata for {asset}: {error}".
                        format(asset=asset_id, error=metadata_upload))
                    output_object.add_workflow_metadata(JobId=job_id)
                    raise MasExecutionError(
                        output_object.return_output_object())
        # If reko job failed then mark workflow as failed
        else:
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(
                CelebrityRecognitionError="Unable to determine status")
            raise MasExecutionError(output_object.return_output_object())
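# Module-level setup assumed by the monitor above but not shown in this
# listing (a sketch; the operator name value is illustrative):
import boto3

rek = boto3.client('rekognition')
operator_name = 'celebrityRecognition'  # illustrative value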
def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
        # If operator_object.input["Media"]["Text"]["S3Key"] is a json file,
        # then we're working with metadata about the text file and need to
        # get the actual transcript text from the TextTranscriptUri field.
        # Otherwise we assume operator_object.input["Media"]["Text"]["S3Key"]
        # contains only the transcript text.
        file_ext = str(key.split('.')[-1])
        if file_ext == "json":
            obj = s3.get_object(Bucket=bucket, Key=key)
            results = obj['Body'].read().decode('utf-8')
            results_json = json.loads(results)
            try:
                uri_data = results_json["TextTranscriptUri"]
            except KeyError:
                raise MasExecutionError(
                    "JSON can only be passed in from AWS transcribe")
            else:
                bucket = uri_data['S3Bucket']
                key = uri_data['S3Key']
        uri = "s3://" + bucket + '/' + key
        # If input text is empty then we're done.
        response = s3.head_object(Bucket=bucket, Key=key)
        # If a KmsKey is specified as an input to this operator, then use that
        # to enable encryption in the Comprehend job.
        kms_key_id = ""
        if "KmsKeyId" in operator_object.configuration:
            kms_key_id = operator_object.configuration["KmsKeyId"]
            print(
                "Found a KMS Key Id. Encryption will be enabled in the Comprehend job."
            )
        else:
            print(
                "No KMS Key was specified. Encryption will not be enabled in the Comprehend job."
            )
        if response['ContentLength'] < 1:
            operator_object.update_workflow_status("Complete")
            operator_object.add_workflow_metadata(
                comprehend_entity_job_id="Empty input --> empty output.")
            return operator_object.return_output_object()
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    dataplane = DataPlane()
    output_uri_request = dataplane.generate_media_storage_path(
        asset_id, workflow_id)
    output_uri = "s3://{bucket}/{key}".format(
        bucket=output_uri_request["S3Bucket"],
        key=output_uri_request["S3Key"] + '/comprehend_entities')
    try:
        if kms_key_id != '':
            # If the user specified a KMS key then enable comprehend job encryption.
            comprehend.start_entities_detection_job(
                InputDataConfig={
                    "S3Uri": uri,
                    "InputFormat": "ONE_DOC_PER_FILE"
                },
                OutputDataConfig={
                    "S3Uri": output_uri,
                    "KmsKeyId": kms_key_id
                },
                DataAccessRoleArn=comprehend_role,
                VolumeKmsKeyId=kms_key_id,
                JobName=workflow_id,
                LanguageCode="en")
        else:
            comprehend.start_entities_detection_job(
                InputDataConfig={
                    "S3Uri": uri,
                    "InputFormat": "ONE_DOC_PER_FILE"
                },
                OutputDataConfig={"S3Uri": output_uri},
                DataAccessRoleArn=comprehend_role,
                JobName=workflow_id,
                LanguageCode="en")
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Unable to get response from comprehend: {e}".
            format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        comprehend_job_id = workflow_id
        operator_object.add_workflow_metadata(
            comprehend_entity_job_id=comprehend_job_id,
            entity_output_uri=output_uri)
        operator_object.update_workflow_status('Executing')
        return operator_object.return_output_object()
Example #5
def lambda_handler(event, context):
    print("We got this event:\n", event)

    operator_object = MediaInsightsOperationHelper(event)

    try:
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
        job_id = workflow_id
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="No valid job id")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = comprehend.list_key_phrases_detection_jobs(Filter={
            'JobName':
            job_id,
        }, )
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Unable to get response from comprehend: {e}".
            format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        print(response)
        comprehend_status = response["KeyPhrasesDetectionJobPropertiesList"][
            0]["JobStatus"]
        if comprehend_status == "SUBMITTED" or comprehend_status == "IN_PROGRESS":
            operator_object.add_workflow_metadata(
                comprehend_phrases_job_id=job_id)
            operator_object.update_workflow_status("Executing")
            return operator_object.return_output_object()
        elif comprehend_status == "COMPLETED":
            output_uri = response["KeyPhrasesDetectionJobPropertiesList"][0][
                "OutputDataConfig"]["S3Uri"]

            delimiter = '/'

            bucket = delimiter.join(output_uri.split(delimiter)[2:3])
            file_name = output_uri.split(delimiter)[-1]
            key = delimiter.join(
                output_uri.split(delimiter)[3:-1]) + '/' + file_name

            comprehend_tarball = read_from_s3(bucket, key)

            comprehend_data = {
                "LanguageCode":
                response['KeyPhrasesDetectionJobPropertiesList'][0]
                ['LanguageCode'],
                "Results": []
            }

            if comprehend_tarball["Status"] == "Success":
                input_bytes = comprehend_tarball["Object"]
                with tarfile.open(fileobj=BytesIO(input_bytes)) as tf:
                    for member in tf:
                        if member.isfile():
                            comprehend_data["Results"].append(
                                tf.extractfile(member).read().decode('utf-8'))

                dataplane = DataPlane()

                metadata_upload = dataplane.store_asset_metadata(
                    asset_id, "key_phrases", workflow_id, comprehend_data)

                if "Status" not in metadata_upload:
                    operator_object.update_workflow_status("Error")
                    operator_object.add_workflow_metadata(
                        comprehend_error="Unable to store key phrases data {e}"
                        .format(e=metadata_upload))
                    raise MasExecutionError(
                        operator_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        operator_object.update_workflow_status("Complete")
                        operator_object.add_workflow_metadata(
                            comprehend_entity_job_id=job_id,
                            output_uri=output_uri)
                        operator_object.update_workflow_status("Complete")
                        return operator_object.return_output_object()
                    else:
                        operator_object.update_workflow_status("Error")
                        operator_object.add_workflow_metadata(
                            comprehend_error=
                            "Unable to store key phrases data {e}".format(
                                e=metadata_upload))
                        raise MasExecutionError(
                            operator_object.return_output_object())
            else:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    comprehend_phrases_job_id=job_id,
                    comprehend_error="could not retrieve output from s3: {e}".
                    format(e=comprehend_tarball["Message"]))
                raise MasExecutionError(operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                comprehend_phrases_job_id=job_id,
                comprehend_error="comprehend returned as failed: {e}".format(
                    e=response["KeyPhrasesDetectionJobPropertiesList"][0]
                    ["Message"]))
            raise MasExecutionError(operator_object.return_output_object())
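# read_from_s3() is a helper that is not shown in this listing. Based on how
# its return value is used above ("Status", "Object", "Message"), and assuming
# a module-level boto3 S3 client named s3, it is presumably something like
# this sketch:
def read_from_s3(bucket, key):
    try:
        obj = s3.get_object(Bucket=bucket, Key=key)
    except Exception as e:
        return {"Status": "Error", "Message": str(e)}
    return {"Status": "Success", "Object": obj['Body'].read()}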
import os
import json
import boto3
import urllib3
import html
import webvtt
from io import StringIO
from botocore import config
from urllib.parse import urlparse
from datetime import datetime

from MediaInsightsEngineLambdaHelper import MediaInsightsOperationHelper
from MediaInsightsEngineLambdaHelper import MasExecutionError
from MediaInsightsEngineLambdaHelper import DataPlane

s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
headers = {"Content-Type": "application/json"}
dataplane = DataPlane()

mie_config = json.loads(os.environ['botoConfig'])
config = config.Config(**mie_config)
translate_client = boto3.client('translate', config=config)
polly = boto3.client('polly', config=config)


class WebCaptions:
    def __init__(self, operator_object):
        """
        :param event: The event passed in to the operator

        """
        print("WebCaptions operator_object = {}".format(operator_object))
        self.operator_object = operator_object
Example #7
def lambda_handler(event, context):

    operator_object = MediaInsightsOperationHelper(event)

    try:
        job_id = operator_object.metadata["TranscribeJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranscribeError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    try:
        response = transcribe.get_transcription_job(
            TranscriptionJobName=job_id)
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError=str(e),
                                              TranscribeJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "IN_PROGRESS":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "FAILED":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError=str(
                    response["TranscriptionJob"]["FailureReason"]))
            raise MasExecutionError(operator_object.return_output_object())
        elif response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "COMPLETED":
            transcribe_uri = response["TranscriptionJob"]["Transcript"][
                "TranscriptFileUri"]
            http = urllib3.PoolManager()
            transcription = http.request('GET', transcribe_uri)
            transcription_data = transcription.data.decode("utf-8")
            transcription_json = json.loads(transcription_data)

            text_only_transcript = ''

            # Concatenate every transcript into a single text-only transcript.
            for transcripts in transcription_json["results"]["transcripts"]:
                text_only_transcript = text_only_transcript + transcripts["transcript"]

            print(text_only_transcript)

            dataplane = DataPlane()
            s3 = boto3.client('s3')

            transcript_storage_path = dataplane.generate_media_storage_path(
                asset_id, workflow_id)

            key = transcript_storage_path['S3Key'] + "transcript.txt"
            bucket = transcript_storage_path['S3Bucket']

            s3.put_object(Bucket=bucket, Key=key, Body=text_only_transcript)

            transcription_json["TextTranscriptUri"] = {
                "S3Bucket": bucket,
                "S3Key": key
            }

            metadata_upload = dataplane.store_asset_metadata(
                asset_id, operator_object.name, workflow_id,
                transcription_json)
            if "Status" not in metadata_upload:
                operator_object.add_workflow_metadata(
                    TranscribeError=
                    "Unable to upload metadata for asset: {asset}".format(
                        asset=asset_id),
                    TranscribeJobId=job_id)
                operator_object.update_workflow_status("Error")
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if metadata_upload['Status'] == 'Success':
                    operator_object.add_media_object('Text',
                                                     metadata_upload['Bucket'],
                                                     metadata_upload['Key'])
                    operator_object.add_workflow_metadata(
                        TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Complete")
                    return operator_object.return_output_object()
                else:
                    operator_object.add_workflow_metadata(
                        TranscribeError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id),
                        TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Error")
                    raise MasExecutionError(
                        operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeError="Unable to determine status")
            raise MasExecutionError(operator_object.return_output_object())
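# Module-level setup assumed by the Transcribe monitor above but not shown in
# this listing (a sketch):
import json
import boto3
import urllib3

transcribe = boto3.client('transcribe')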
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    bucket = ''
    key = ''
    try:
        if "Video" in event["Input"]["Media"]:
            bucket = event["Input"]["Media"]["Video"]["S3Bucket"]
            key = event["Input"]["Media"]["Video"]["S3Key"]
        elif "Image" in event["Input"]["Media"]:
            bucket = event["Input"]["Media"]["Image"]["S3Bucket"]
            key = event["Input"]["Media"]["Image"]["S3Key"]
        workflow_id = str(operator_object.workflow_execution_id)
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediaconvertError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    # Wrap in a try/except for now since we aren't guaranteed an asset id will be present; this should be removed later
    try:
        asset_id = operator_object.asset_id
    except KeyError as e:
        print("No asset id passed in with this workflow", e)
        asset_id = ''

    # Get metadata
    s3_cli = boto3.client("s3", region_name=region, config=Config(signature_version='s3v4', s3={'addressing_style': 'virtual'}))
    metadata_json = {}
    try:
        # The number of seconds that the Signed URL is valid:
        signed_url_expiration = 300
        # Generate a signed URL for reading a file from S3 via HTTPS
        signed_url = s3_cli.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': key}, ExpiresIn=signed_url_expiration)
        # Launch MediaInfo
        media_info = MediaInfo.parse(signed_url)
        # Save the result
        metadata_json = json.loads(media_info.to_json())
        # If there's no Video, Audio, Image, or Text data then delete the file.
        track_types = [track['track_type'] for track in metadata_json['tracks']]
        if ('Video' not in track_types and
                'Audio' not in track_types and
                'Image' not in track_types and
                'Text' not in track_types):
            print("ERROR: File does not contain valid video, audio, image, or text content")
            print("Deleting file s3://" + bucket + "/" + key)
            s3_cli.delete_object(Bucket=bucket, Key=key)
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(MediainfoError="File does not contain valid video, audio, image, or text content")
            raise MasExecutionError(operator_object.return_output_object())
    except RuntimeError as e:
        # If MediaInfo could not run then we assume it is not a valid
        # media file and delete it
        print("Exception:\n", e)
        print("ERROR: File does not contain valid video, audio, image, or text content")
        print("Deleting file s3://" + bucket + "/" + key)
        s3_cli.delete_object(Bucket=bucket, Key=key)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediainfoError="File does not contain valid video, audio, image, or text content")
        raise MasExecutionError(operator_object.return_output_object())
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediainfoError="Unable to get Mediainfo results. " + str(e))
        raise MasExecutionError(operator_object.return_output_object())

    # Verify that the metadata is a dict, as required by the dataplane
    if type(metadata_json) != dict:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            MediainfoError="Metadata must be of type dict. Found " + str(type(metadata_json)) + " instead.")
        raise MasExecutionError(operator_object.return_output_object())

    # Pass metadata to downstream operators
    # Number of audio tracks is used by the Transcribe operator
    num_audio_tracks = len(list(filter(lambda i: i['track_type'] == 'Audio', metadata_json['tracks'])))
    operator_object.add_workflow_metadata(Mediainfo_num_audio_tracks=str(num_audio_tracks))

    # Save metadata to dataplane
    operator_object.add_workflow_metadata(AssetId=asset_id, WorkflowExecutionId=workflow_id)
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id, metadata_json)

    # Validate that the metadata was saved to the dataplane
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            MediainfoError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        # Update the workflow status
        if metadata_upload["Status"] == "Success":
            print("Uploaded metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                MediainfoError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())
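# Module-level setup assumed by the Mediainfo handler above but not shown in
# this listing (a sketch; the source of the region value is an assumption):
import os
import json
import boto3
from botocore.config import Config
from pymediainfo import MediaInfo

region = os.environ['AWS_REGION']  # assumed source of the region value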
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # Get operator parameters
    try:
        workflow_id = str(event["WorkflowExecutionId"])
        asset_id = event['AssetId']
        if "Video" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
            key = operator_object.input["Media"]["Video"]["S3Key"]
            file_type = key.split('.')[-1]
        elif "Audio" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Audio"]["S3Bucket"]
            key = operator_object.input["Media"]["Audio"]["S3Key"]
            file_type = key.split('.')[-1]
        elif "Image" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Image"]["S3Bucket"]
            key = operator_object.input["Media"]["Image"]["S3Key"]
            file_type = key.split('.')[-1]
        elif "Text" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
            key = operator_object.input["Media"]["Text"]["S3Key"]
            file_type = key.split('.')[-1]
    except Exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            GenericDataLookupError="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())

    # Get the metadata filename
    print("Looking up metadata for s3://" + bucket + "/" + key)
    # Get user-defined location for generic data file
    if "Key" in operator_object.configuration:
        metadata_filename = operator_object.configuration["Key"]
    else:
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Missing S3 key for data file.")
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    if "Bucket" in operator_object.configuration:
        metadata_bucket = operator_object.configuration["Bucket"]
    else:
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Missing S3 bucket for data file.")
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())

    # Get metadata
    s3 = boto3.client('s3')
    try:
        print("Getting data from s3://" + metadata_bucket + "/" +
              metadata_filename)
        data = s3.get_object(Bucket=metadata_bucket, Key=metadata_filename)
        metadata_json = json.loads(data['Body'].read().decode('utf-8'))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Unable read datafile. " + str(e))
        raise MasExecutionError(operator_object.return_output_object())

    # Verify that the metadata is a dict, as required by the dataplane
    if (type(metadata_json) != dict):
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Metadata must be of type dict. Found " +
            str(type(metadata_json)) + " instead.")
        raise MasExecutionError(operator_object.return_output_object())

    # Save metadata to dataplane
    operator_object.add_workflow_metadata(AssetId=asset_id,
                                          WorkflowExecutionId=workflow_id)
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id,
                                                     operator_object.name,
                                                     workflow_id,
                                                     metadata_json)

    # Validate that the metadata was saved to the dataplane
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            GenericDataLookupError=
            "Unable to upload metadata for asset: {asset}".format(
                asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        # Update the workflow status
        if metadata_upload["Status"] == "Success":
            print(
                "Uploaded metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                GenericDataLookupError=
                "Unable to upload metadata for asset: {asset}".format(
                    asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())
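# For reference, the operator above reads the data file location from its
# operator configuration, so the workflow must supply "Bucket" and "Key"
# there, e.g. (illustrative values only):
#
#   {"Bucket": "my-metadata-bucket", "Key": "metadata/my-asset.json"}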
Example #10
def lambda_handler(event, context):
    # Instantiate the output helper (as in the other handlers in this listing).
    output_object = MediaInsightsOperationHelper(event)
    try:
        status = event["Status"]
        asset_id = event['MetaData']['AssetId']
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            CelebrityRecognitionError="Missing key {e}".format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Images will have already been processed, so return if job status is already set.
    if status == "Complete":
        # TODO: Persist rekognition output
        output_object.update_workflow_status("Complete")
        return output_object.return_output_object()

    try:
        job_id = event["MetaData"]["CelebrityRecognitionJobId"]
        workflow_id = event["MetaData"]["WorkflowExecutionId"]
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            CelebrityRecognitionError="Missing a required metadata key {e}".
            format(e=e))
        raise MasExecutionError(output_object.return_output_object())

    # Check rekognition job status:
    rek = boto3.client('rekognition')
    dataplane = DataPlane()
    max_results = 1000
    pagination_token = ''
    finished = False
    # Pagination starts at the 1,001st result. This while loop iterates through each page.
    while not finished:
        response = rek.get_celebrity_recognition(JobId=job_id,
                                                 MaxResults=max_results,
                                                 NextToken=pagination_token)

        if response['JobStatus'] == "IN_PROGRESS":
            finished = True
            output_object.update_workflow_status("Executing")
            output_object.add_workflow_metadata(
                CelebrityRecognitionJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return output_object.return_output_object()
        elif response['JobStatus'] == "FAILED":
            finished = True
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(
                CelebrityRecognitionJobId=job_id,
                CelebrityRecognitionError=str(response["StatusMessage"]))
            raise MasExecutionError(output_object.return_output_object())
        elif response['JobStatus'] == "SUCCEEDED":
            if 'NextToken' in response:
                pagination_token = response['NextToken']
                # Persist rekognition results (current page)
                metadata_upload = dataplane.store_asset_metadata(
                    asset_id, operator_name, workflow_id, response)
                if "Status" not in metadata_upload:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        CelebrityRecognitionError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id))
                    raise MasExecutionError(
                        output_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        print("Uploaded metadata for asset: {asset}".format(
                            asset=asset_id))
                    elif metadata_upload["Status"] == "Failed":
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        raise MasExecutionError(
                            output_object.return_output_object())
                    else:
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionJobId=job_id)
                        raise MasExecutionError(
                            output_object.return_output_object())
            else:
                finished = True
                # Persist rekognition results
                metadata_upload = dataplane.store_asset_metadata(
                    asset_id, operator_name, workflow_id, response)
                if "Status" not in metadata_upload:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        CelebrityRecognitionError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id))
                    raise MasExecutionError(
                        output_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        print("Uploaded metadata for asset: {asset}".format(
                            asset=asset_id))
                    elif metadata_upload["Status"] == "Failed":
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        raise MasExecutionError(
                            output_object.return_output_object())
                    else:
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionJobId=job_id)
                        raise MasExecutionError(
                            output_object.return_output_object())
                    output_object.add_workflow_metadata(
                        CelebrityRecognitionJobId=job_id)
                    output_object.update_workflow_status("Complete")
                    return output_object.return_output_object()
        else:
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(
                CelebrityRecognitionError="Unable to determine status")
            raise MasExecutionError(output_object.return_output_object())
Example #11
def lambda_handler(event, context):
    print("We got the following event:\n", event)

    operator_object = MediaInsightsOperationHelper(event)

    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="No valid inputs {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''

    try:
        source_lang = operator_object.configuration["SourceLanguageCode"]
        target_lang = operator_object.configuration["TargetLanguageCode"]
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Language codes are not defined")
        raise MasExecutionError(operator_object.return_output_object())

    try:
        s3_response = s3.get_object(Bucket=bucket, Key=key)
        transcribe_metadata = json.loads(
            s3_response["Body"].read().decode("utf-8"))
        transcript = transcribe_metadata["results"]["transcripts"][0][
            "transcript"]
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to read transcription from S3: {e}".format(
                e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())

    # If input text is empty then we're done.
    if len(transcript) < 1:
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()

    # Tell the NLTK data loader to look for files in /tmp/
    nltk.data.path.append("/tmp/")
    # Download NLTK tokenizers to /tmp/
    # We use /tmp because that's where AWS Lambda provides write access to the local file system.
    nltk.download('punkt', download_dir='/tmp/')
    # Create language tokenizer according to user-specified source language.
    # Default to English.
    if source_lang == 'fr':
        print("Using French dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    elif source_lang == 'de':
        print("Using German dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
    elif source_lang == 'ru':
        print("Using Russian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    elif source_lang == 'it':
        print("Using Italian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
    elif source_lang == 'pt':
        print("Using Portuguese dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
    elif source_lang == 'es':
        print("Using Spanish dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    else:
        print("Using English dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Split input text into a list of sentences
    sentences = tokenizer.tokenize(transcript)
    print("Input text length: " + str(len(transcript)))
    print("Number of sentences: " + str(len(sentences)))
    translated_text = ''
    transcript_chunk = ''
    for sentence in sentences:
        # Amazon Translate can handle 5000 Unicode characters per request, but
        # we'll process no more than 1000 at a time to be on the safe side.
        # Even when limiting input text to 3000 characters we still saw
        # Translate throttling with RateExceeded exceptions; reducing the
        # chunk size to 1000 characters appeared to fix this.
        if (len(sentence) + len(transcript_chunk) < 1000):
            transcript_chunk = transcript_chunk + ' ' + sentence
        else:
            try:
                print("Translation input text length: " +
                      str(len(transcript_chunk)))
                translation_chunk = translate_client.translate_text(
                    Text=transcript_chunk,
                    SourceLanguageCode=source_lang,
                    TargetLanguageCode=target_lang)
                print("Translation output text length: " +
                      str(len(translation_chunk["TranslatedText"])))
            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    TranslateError="Unable to get response from translate: {e}"
                    .format(e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())
            translated_text = translated_text + ' ' + translation_chunk[
                "TranslatedText"]
            transcript_chunk = sentence
    print("Translating the final chunk of input text...")
    try:
        print("Translation input text length: " + str(len(transcript_chunk)))
        translation_chunk = translate_client.translate_text(
            Text=transcript_chunk,
            SourceLanguageCode=source_lang,
            TargetLanguageCode=target_lang)
        print("Translation output text length: " + str(len(translation_chunk)))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to get response from translate: {e}".format(
                e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    translated_text = translated_text + ' ' + translation_chunk[
        "TranslatedText"]
    # Put the final result into a JSON object because the MIE dataplane requires metadata to be a dict.
    translation_result = {}
    translation_result["TranslatedText"] = translated_text
    translation_result["SourceLanguageCode"] = source_lang
    translation_result["TargetLanguageCode"] = target_lang
    print("Final translation text length: " + str(len(translated_text)))
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id,
                                                     operator_object.name,
                                                     workflow_id,
                                                     translation_result)
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            TranslateError="Unable to upload metadata for asset: {asset}".
            format(asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if metadata_upload['Status'] == 'Success':
            operator_object.add_media_object('Text', metadata_upload['Bucket'],
                                             metadata_upload['Key'])
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                TranslateError="Unable to upload metadata for asset: {asset}".
                format(asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())
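# Module-level setup assumed by the translation handler above but not shown in
# this listing (a sketch):
import json
import boto3
import nltk

s3 = boto3.client('s3')
translate_client = boto3.client('translate')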
Example #12
def lambda_handler(event, context):
        print("We got the following event:\n", event)

        operator_object = MediaInsightsOperationHelper(event)

        try:
            bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
            key = operator_object.input["Media"]["Text"]["S3Key"]
        except KeyError as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="No valid inputs {e}".format(e=e))
            raise MasExecutionError(operator_object.return_output_object())

        try:
            workflow_id = operator_object.workflow_execution_id
        except KeyError as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Missing a required metadata key {e}".format(e=e))
            raise MasExecutionError(operator_object.return_output_object())

        try:
            asset_id = operator_object.asset_id
        except KeyError:
            print('No asset id for this workflow')
            asset_id = ''

        try:
            source_lang = operator_object.configuration["SourceLanguageCode"]
            target_lang = operator_object.configuration["TargetLanguageCode"]
        except KeyError:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Language codes are not defined")
            raise MasExecutionError(operator_object.return_output_object())

        try:
            s3_response = s3.get_object(Bucket=bucket, Key=key)
            transcribe_metadata = json.loads(s3_response["Body"].read().decode("utf-8"))
            transcript = transcribe_metadata["results"]["transcripts"][0]["transcript"]
        except Exception as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Unable to read transcription from S3: {e}".format(e=str(e)))
            raise MasExecutionError(operator_object.return_output_object())

        try:
            translation = translate_client.translate_text(
                Text=transcript,
                SourceLanguageCode=source_lang,
                TargetLanguageCode=target_lang
            )
        except Exception as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Unable to get response from translate: {e}".format(e=str(e)))
            raise MasExecutionError(operator_object.return_output_object())
        else:

            dataplane = DataPlane()
            metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id,
                                                             translation)
            if "Status" not in metadata_upload:
                operator_object.add_workflow_metadata(
                    TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                operator_object.update_workflow_status("Error")
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if metadata_upload['Status'] == 'Success':
                    operator_object.add_media_object('Text', metadata_upload['Bucket'], metadata_upload['Key'])
                    operator_object.update_workflow_status("Complete")
                    return operator_object.return_output_object()
                else:
                    operator_object.add_workflow_metadata(
                        TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                    operator_object.update_workflow_status("Error")
                    raise MasExecutionError(operator_object.return_output_object())
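
# For reference, this is the shape of the Transcribe output JSON that the
# operator above reads from S3. The structure follows the standard Amazon
# Transcribe result format; the sample values are invented for illustration.
import json

sample_transcribe_output = {
    "jobName": "example-job",
    "results": {
        "transcripts": [
            {"transcript": "Hello world. This is a sample transcript."}
        ],
        "items": []
    },
    "status": "COMPLETED"
}

# The operator pulls the plain transcript text from the first transcripts entry.
transcript = sample_transcribe_output["results"]["transcripts"][0]["transcript"]
print(transcript)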
Example #13
def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # If Transcribe wasn't run due to silent audio, then we're done
    if "Mediainfo_num_audio_tracks" in event["Input"]["MetaData"] and event[
            "Input"]["MetaData"]["Mediainfo_num_audio_tracks"] == "0":
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    try:
        job_id = operator_object.metadata["TranscribeJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranscribeError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = transcribe.get_transcription_job(
            TranscriptionJobName=job_id)
        source_language = response['TranscriptionJob']['LanguageCode']
        print("get_transcription_job response:")
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranscribeError=str(e),
            TranscribeJobId=job_id,
        )
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "IN_PROGRESS":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "FAILED":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError=str(
                    response["TranscriptionJob"]["FailureReason"]))
            raise MasExecutionError(operator_object.return_output_object())
        elif response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "COMPLETED":
            transcribe_uri = response["TranscriptionJob"]["Transcript"][
                "TranscriptFileUri"]
            http = urllib3.PoolManager()
            transcription = http.request('GET', transcribe_uri)
            transcription_data = transcription.data.decode("utf-8")
            transcription_json = json.loads(transcription_data)

            text_only_transcript = ''

            for transcript_item in transcription_json["results"]["transcripts"]:
                # Append each transcript segment to build one plain-text blob.
                text_only_transcript = text_only_transcript + transcript_item["transcript"]

            print(text_only_transcript)

            dataplane = DataPlane()
            s3 = boto3.client('s3', config=config)

            transcript_storage_path = dataplane.generate_media_storage_path(
                asset_id, workflow_id)

            key = transcript_storage_path['S3Key'] + "transcript.txt"
            bucket = transcript_storage_path['S3Bucket']

            s3.put_object(Bucket=bucket, Key=key, Body=text_only_transcript)

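            # Record where the plain-text transcript was written so downstream
            # operators can read it without re-parsing the full Transcribe JSON.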
            transcription_json["TextTranscriptUri"] = {
                "S3Bucket": bucket,
                "S3Key": key
            }

            metadata_upload = dataplane.store_asset_metadata(
                asset_id, operator_object.name, workflow_id,
                transcription_json)
            if "Status" not in metadata_upload:
                operator_object.add_workflow_metadata(
                    TranscribeError=
                    "Unable to upload metadata for asset: {asset}".format(
                        asset=asset_id),
                    TranscribeJobId=job_id)
                operator_object.update_workflow_status("Error")
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if metadata_upload['Status'] == 'Success':
                    operator_object.add_media_object('Text',
                                                     metadata_upload['Bucket'],
                                                     metadata_upload['Key'])
                    # The source language may be user-specified or auto-detected by
                    # Transcribe. Either way, pass it to downstream operators as
                    # workflow metadata.
                    operator_object.add_workflow_metadata(
                        TranscribeJobId=job_id,
                        TranscribeSourceLanguage=source_language)

                    operator_object.update_workflow_status("Complete")
                    return operator_object.return_output_object()
                else:
                    operator_object.add_workflow_metadata(
                        TranscribeError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id),
                        TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Error")
                    raise MasExecutionError(
                        operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeError="Unable to determine status")
            raise MasExecutionError(operator_object.return_output_object())
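
# The operator above reports "Executing" while the Transcribe job is still
# running so the surrounding workflow can invoke it again later. For contrast,
# a minimal in-process polling sketch of the same status check could look like
# the following; the function name and poll interval are assumptions for
# illustration, not part of the original operator.
import time
import boto3

transcribe = boto3.client('transcribe')

def wait_for_transcription(job_name, poll_seconds=15):
    while True:
        response = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        status = response['TranscriptionJob']['TranscriptionJobStatus']
        if status == 'COMPLETED':
            # Return the URI of the finished transcript JSON.
            return response['TranscriptionJob']['Transcript']['TranscriptFileUri']
        if status == 'FAILED':
            raise RuntimeError(response['TranscriptionJob'].get('FailureReason', 'Unknown failure'))
        time.sleep(poll_seconds)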
Example #14
def lambda_handler(event, context):
    #print("We got this event:\n", event)

    operator_object = MediaInsightsOperationHelper(event)

    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]

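        # If the key points to a JSON file, treat it as Transcribe metadata and
        # resolve the plain-text transcript location from its TextTranscriptUri
        # field; otherwise treat the file itself as the transcript text.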
        file_ext = str(key.split('.')[-1])
        if file_ext == "json":
            s3 = boto3.client('s3')
            obj = s3.get_object(
                Bucket=bucket,
                Key=key
            )
            results = obj['Body'].read().decode('utf-8')
            results_json = json.loads(results)
            try:
                uri_data = results_json["TextTranscriptUri"]
            except KeyError:
                raise MasExecutionError("JSON can only be passed in from AWS transcribe")
            else:
                uri = "s3://" + uri_data['S3Bucket'] + '/' + uri_data['S3Key']
        else:
            uri = "s3://" + bucket + '/' + key

    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())

    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''

    dataplane = DataPlane()

    output_uri_request = dataplane.generate_media_storage_path(asset_id, workflow_id)

    output_uri = "s3://{bucket}/{key}".format(bucket=output_uri_request["S3Bucket"],
                                              key=output_uri_request["S3Key"] + '/comprehend_entities')

    try:
        comprehend.start_entities_detection_job(
            InputDataConfig={
                "S3Uri": uri,
                "InputFormat": "ONE_DOC_PER_FILE"
            },
            OutputDataConfig={
                "S3Uri": output_uri
            },
            DataAccessRoleArn=comprehend_role,
            JobName=workflow_id,
            LanguageCode="en"
        )
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="Unable to get response from comprehend: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        comprehend_job_id = workflow_id
        operator_object.add_workflow_metadata(comprehend_entity_job_id=comprehend_job_id, entity_output_uri=output_uri)
        operator_object.update_workflow_status('Executing')
        return operator_object.return_output_object()
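
# Because the operator above records the workflow_id it passed as JobName
# (rather than the service-assigned JobId), a companion status-check step would
# typically look the Comprehend job up by name. The sketch below shows one way
# that lookup might work; the helper name and return shape are assumptions, not
# code from the original solution.
import boto3

comprehend = boto3.client('comprehend')

def get_entities_job_status(job_name):
    response = comprehend.list_entities_detection_jobs(
        Filter={'JobName': job_name})
    jobs = response.get('EntitiesDetectionJobPropertiesList', [])
    if not jobs:
        return None
    job = jobs[0]
    # Surface the fields a monitoring step would care about.
    return {
        'JobId': job['JobId'],
        'JobStatus': job['JobStatus'],
        'OutputS3Uri': job['OutputDataConfig']['S3Uri']
    }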