Example #1
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    try:
        if "ProxyEncode" in event["Input"]["Media"]:
            s3bucket = event["Input"]["Media"]["ProxyEncode"]["S3Bucket"]
            s3key = event["Input"]["Media"]["ProxyEncode"]["S3Key"]
        elif "Video" in event["Input"]["Media"]:
            s3bucket = event["Input"]["Media"]["Video"]["S3Bucket"]
            s3key = event["Input"]["Media"]["Video"]["S3Key"]
        elif "Image" in event["Input"]["Media"]:
            s3bucket = event["Input"]["Media"]["Image"]["S3Bucket"]
            s3key = event["Input"]["Media"]["Image"]["S3Key"]
        workflow_id = str(event["WorkflowExecutionId"])
        asset_id = event['AssetId']
    except Exception:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(LabelDetectionError="No valid inputs")
        raise MasExecutionError(output_object.return_output_object())
    print("Processing s3://"+s3bucket+"/"+s3key)
    valid_video_types = [".avi", ".mp4", ".mov"]
    valid_image_types = [".png", ".jpg", ".jpeg"]
    file_type = os.path.splitext(s3key)[1].lower()
    if file_type in valid_image_types:
        # Image processing is synchronous.
        response = detect_labels(s3bucket, urllib.parse.unquote_plus(s3key))
        output_object.add_workflow_metadata(AssetId=asset_id,WorkflowExecutionId=workflow_id)
        dataplane = DataPlane()
        metadata_upload = dataplane.store_asset_metadata(asset_id, operator_name, workflow_id, response)
        if "Status" not in metadata_upload:
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(
                LabelDetectionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
            raise MasExecutionError(output_object.return_output_object())
        else:
            if metadata_upload["Status"] == "Success":
                print("Uploaded metadata for asset: {asset}".format(asset=asset_id))
                output_object.update_workflow_status("Complete")
                return output_object.return_output_object()
            elif metadata_upload["Status"] == "Failed":
                output_object.update_workflow_status("Error")
                output_object.add_workflow_metadata(
                    LabelDetectionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                raise MasExecutionError(output_object.return_output_object())
            else:
                output_object.update_workflow_status("Error")
                output_object.add_workflow_metadata(
                    LabelDetectionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                raise MasExecutionError(output_object.return_output_object())
    elif file_type in valid_video_types:
        # Video processing is asynchronous.
        job_id = start_label_detection(s3bucket, urllib.parse.unquote_plus(s3key))
        output_object.update_workflow_status("Executing")
        output_object.add_workflow_metadata(JobId=job_id, AssetId=asset_id, WorkflowExecutionId=workflow_id)
        return output_object.return_output_object()
    else:
        print("ERROR: invalid file type")
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(LabelDetectionError="Not a valid file type")
        raise MasExecutionError(output_object.return_output_object())
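This handler relies on names defined at module scope in its source file: the imports, the Rekognition client, the operator name, and the two helper functions it calls. A minimal sketch of that assumed setup (names, values, and the helper import path are assumptions, not taken from the listing); the rek client and operator_name are shared by the other Rekognition examples below.

# Assumed module-level setup for the Rekognition label-detection operator (illustrative).
import os
import urllib.parse

import boto3
# MIE Lambda-layer helpers; the import path is an assumption.
from MediaInsightsEngineLambdaHelper import MediaInsightsOperationHelper, MasExecutionError, DataPlane

operator_name = 'labelDetection'  # hypothetical; the deployed operator name may differ
rek = boto3.client('rekognition')

def detect_labels(bucket, key):
    # Synchronous label detection for images.
    return rek.detect_labels(Image={'S3Object': {'Bucket': bucket, 'Name': key}})

def start_label_detection(bucket, key):
    # Asynchronous label detection for videos; returns the job id that the
    # check Lambda later polls with get_label_detection.
    response = rek.start_label_detection(Video={'S3Object': {'Bucket': bucket, 'Name': key}})
    return response['JobId']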
def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # If Transcribe wasn't run due to silent audio, then we're done
    if "Mediainfo_num_audio_tracks" in event["Input"]["MetaData"] and event[
            "Input"]["MetaData"]["Mediainfo_num_audio_tracks"] == "0":
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    try:
        job_id = operator_object.metadata["TranscribeJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranscribeError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = transcribe.get_transcription_job(
            TranscriptionJobName=job_id)
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError=str(e),
                                              TranscribeJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "IN_PROGRESS":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "FAILED":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError=str(
                    response["TranscriptionJob"]["FailureReason"]))
            raise MasExecutionError(operator_object.return_output_object())
        elif response["TranscriptionJob"][
                "TranscriptionJobStatus"] == "COMPLETED":
            transcribe_uri = response["TranscriptionJob"]["Transcript"][
                "TranscriptFileUri"]
            http = urllib3.PoolManager()
            transcription = http.request('GET', transcribe_uri)
            transcription_data = transcription.data.decode("utf-8")
            transcription_json = json.loads(transcription_data)

            text_only_transcript = ''

            for transcripts in transcription_json["results"]["transcripts"]:
                transcript = transcripts["transcript"]
                # Concatenate every transcript segment into one plain-text string
                text_only_transcript = text_only_transcript + transcript

            print(text_only_transcript)

            dataplane = DataPlane()
            s3 = boto3.client('s3')

            transcript_storage_path = dataplane.generate_media_storage_path(
                asset_id, workflow_id)

            key = transcript_storage_path['S3Key'] + "transcript.txt"
            bucket = transcript_storage_path['S3Bucket']

            s3.put_object(Bucket=bucket, Key=key, Body=text_only_transcript)

            transcription_json["TextTranscriptUri"] = {
                "S3Bucket": bucket,
                "S3Key": key
            }

            metadata_upload = dataplane.store_asset_metadata(
                asset_id, operator_object.name, workflow_id,
                transcription_json)
            if "Status" not in metadata_upload:
                operator_object.add_workflow_metadata(
                    TranscribeError=
                    "Unable to upload metadata for asset: {asset}".format(
                        asset=asset_id),
                    TranscribeJobId=job_id)
                operator_object.update_workflow_status("Error")
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if metadata_upload['Status'] == 'Success':
                    operator_object.add_media_object('Text',
                                                     metadata_upload['Bucket'],
                                                     metadata_upload['Key'])
                    operator_object.add_workflow_metadata(
                        TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Complete")
                    return operator_object.return_output_object()
                else:
                    operator_object.add_workflow_metadata(
                        TranscribeError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id),
                        TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Error")
                    raise MasExecutionError(
                        operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeError="Unable to determine status")
            raise MasExecutionError(operator_object.return_output_object())
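Like the other examples, this checker assumes module-level imports and a Transcribe client created in its source file; a minimal sketch of that assumed setup:

# Assumed module-level setup for the Transcribe check handler (illustrative; see also the
# MIE helper imports sketched after Example #1).
import json

import boto3
import urllib3

transcribe = boto3.client('transcribe')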
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    try:
        status = event["Status"]
        asset_id = event['MetaData']['AssetId']
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(LabelDetectionError="Missing key {e}".format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Images will have already been processed, so return if job status is already set.
    if status == "Complete":
        output_object.update_workflow_status("Complete")
        return output_object.return_output_object()
    try:
        job_id = event["MetaData"]["JobId"]
        workflow_id = event["MetaData"]["WorkflowExecutionId"]
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(LabelDetectionError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Check rekognition job status:
    dataplane = DataPlane()
    pagination_token = ''
    is_paginated = False
    # If pagination token is in event["MetaData"] then use that to start
    # reading reko results from where this Lambda's previous invocation left off.
    if ("PageToken" in event["MetaData"]):
        pagination_token = event["MetaData"]["PageToken"]
        is_paginated = True
    # Read and persist 10 reko pages per invocation of this Lambda
    for page_number in range(11):
        # Get reko results
        print("job id: " + job_id + " page token: " + pagination_token)
        try:
            response = rek.get_label_detection(JobId=job_id, NextToken=pagination_token)
        except rek.exceptions.InvalidPaginationTokenException as e:
            # Trying to reverse seek to the last valid pagination token would be difficult
            # to implement, so in the rare case that a pagination token expires we'll
            # just start over by reading from the first page.
            print(e)
            print("WARNING: Invalid pagination token found. Restarting read from first page.")
            pagination_token=''
            continue
        # If the reko job is IN_PROGRESS then return. We'll check again after a step function wait.
        if response['JobStatus'] == "IN_PROGRESS":
            output_object.update_workflow_status("Executing")
            output_object.add_workflow_metadata(JobId=job_id, AssetId=asset_id, WorkflowExecutionId=workflow_id)
            return output_object.return_output_object()
        # If the reko job is FAILED then mark the workflow status as Error and return.
        elif response['JobStatus'] == "FAILED":
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(JobId=job_id, LabelDetectionError=str(response["StatusMessage"]))
            raise MasExecutionError(output_object.return_output_object())
        # If the reko job is SUCCEEDED then save this current reko page result
        # and continue to next page_number.
        elif response['JobStatus'] == "SUCCEEDED":
            # If reko results contain more pages then save this page and continue to the next page
            if 'NextToken' in response:
                is_paginated = True
                # Persist rekognition results (current page)
                metadata_upload = dataplane.store_asset_metadata(asset_id=asset_id, operator_name=operator_name, workflow_id=workflow_id, results=response, paginate=True, end=False)
                # If dataplane request succeeded then get the next pagination token and continue.
                if "Status" in metadata_upload and metadata_upload["Status"] == "Success":
                    # Log that this page has been successfully uploaded to the dataplane
                    print("Uploaded metadata for asset: {asset}, job {JobId}, page {page}".format(asset=asset_id, JobId=job_id, page=pagination_token))
                    # Get the next pagination token:
                    pagination_token = response['NextToken']
                    # In order to avoid Lambda timeouts, we're only going to persist 10 pages then
                    # pass the pagination token to the workflow metadata and let our step function
                    # invoker restart this Lambda. The pagination token allows this Lambda
                    # continue from where it left off.
                    if page_number == 10:
                        output_object.update_workflow_status("Executing")
                        output_object.add_workflow_metadata(PageToken=pagination_token, JobId=job_id, AssetId=asset_id, WorkflowExecutionId=workflow_id)
                        return output_object.return_output_object()
                # If dataplane request failed then mark workflow as failed
                else:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(LabelDetectionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id), JobId=job_id)
                    raise MasExecutionError(output_object.return_output_object())
            # If reko results contain no more pages then save this page and mark the stage complete
            else:
                # If we've been saving pages, then tell dataplane this is the last page
                if is_paginated:
                    metadata_upload = dataplane.store_asset_metadata(asset_id=asset_id, operator_name=operator_name, workflow_id=workflow_id, results=response, paginate=True, end=True)
                # If there is only one page then save to dataplane without dataplane options
                else:
                    metadata_upload = dataplane.store_asset_metadata(asset_id=asset_id, operator_name=operator_name, workflow_id=workflow_id, results=response)
                # If dataplane request succeeded then mark the stage complete
                if "Status" in metadata_upload and metadata_upload["Status"] == "Success":
                    print("Uploaded metadata for asset: {asset}, job {JobId}, page {page}".format(asset=asset_id, JobId=job_id, page=pagination_token))
                    output_object.add_workflow_metadata(JobId=job_id)
                    output_object.update_workflow_status("Complete")
                    return output_object.return_output_object()
                # If dataplane request failed then mark workflow as failed
                else:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(LabelDetectionError="Unable to upload metadata for {asset}: {error}".format(asset=asset_id, error=metadata_upload))
                    output_object.add_workflow_metadata(JobId=job_id)
                    raise MasExecutionError(output_object.return_output_object())
        # If reko job failed then mark workflow as failed
        else:
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(LabelDetectionError="Unable to determine status")
            raise MasExecutionError(output_object.return_output_object())
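Since the Step Functions wait loop re-invokes this checker until the Rekognition job finishes, the shape of the incoming event drives the control flow above. Two hypothetical events (all values invented) illustrating a first check and a paginated re-entry via PageToken:

# Hypothetical events for the label-detection check handler (all values are invented).
first_check_event = {
    "Status": "Executing",
    "MetaData": {
        "AssetId": "example-asset-id",
        "WorkflowExecutionId": "example-workflow-id",
        "JobId": "example-rekognition-job-id"
    }
}

# A later invocation resumes from where the previous one stopped by passing PageToken.
resumed_check_event = {
    "Status": "Executing",
    "MetaData": {
        "AssetId": "example-asset-id",
        "WorkflowExecutionId": "example-workflow-id",
        "JobId": "example-rekognition-job-id",
        "PageToken": "example-pagination-token"
    }
}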
Example #4
def lambda_handler(event, context):
    print("We got the following event:\n", event)

    operator_object = MediaInsightsOperationHelper(event)

    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="No valid inputs {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''

    try:
        source_lang = operator_object.configuration["SourceLanguageCode"]
        target_lang = operator_object.configuration["TargetLanguageCode"]
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Language codes are not defined")
        raise MasExecutionError(operator_object.return_output_object())

    try:
        s3_response = s3.get_object(Bucket=bucket, Key=key)
        transcribe_metadata = json.loads(
            s3_response["Body"].read().decode("utf-8"))
        transcript = transcribe_metadata["results"]["transcripts"][0][
            "transcript"]
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to read transcription from S3: {e}".format(
                e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())

    # If input text is empty then we're done.
    if len(transcript) < 1:
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()

    # Tell the NLTK data loader to look for files in /tmp/
    nltk.data.path.append("/tmp/")
    # Download NLTK tokenizers to /tmp/
    # We use /tmp because that's where AWS Lambda provides write access to the local file system.
    nltk.download('punkt', download_dir='/tmp/')
    # Create language tokenizer according to user-specified source language.
    # Default to English.
    if source_lang == 'fr':
        print("Using French dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    elif source_lang == 'de':
        print("Using German dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
    elif source_lang == 'ru':
        print("Using Russian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    elif source_lang == 'it':
        print("Using Italian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
    elif source_lang == 'pt':
        print("Using Portuguese dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
    elif source_lang == 'es':
        print("Using Spanish dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    else:
        print("Using English dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Split input text into a list of sentences
    sentences = tokenizer.tokenize(transcript)
    print("Input text length: " + str(len(transcript)))
    print("Number of sentences: " + str(len(sentences)))
    translated_text = ''
    transcript_chunk = ''
    for sentence in sentences:
        # Translate can handle 5000 unicode characters but we'll process no
        # more than 1000 just to be on the safe side.
        # Even by limiting input text to 3000 characters, we've still seen
        # translate throttling with a RateExceeded exception.
        # Reducing input text to 1000 characters seemed to fix this.
        if (len(sentence) + len(transcript_chunk) < 1000):
            transcript_chunk = transcript_chunk + ' ' + sentence
        else:
            try:
                print("Translation input text length: " +
                      str(len(transcript_chunk)))
                translation_chunk = translate_client.translate_text(
                    Text=transcript_chunk,
                    SourceLanguageCode=source_lang,
                    TargetLanguageCode=target_lang)
                print("Translation output text length: " +
                      str(len(translation_chunk)))
            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    TranslateError="Unable to get response from translate: {e}"
                    .format(e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())
            translated_text = translated_text + ' ' + translation_chunk[
                "TranslatedText"]
            transcript_chunk = sentence
    print("Translating the final chunk of input text...")
    try:
        print("Translation input text length: " + str(len(transcript_chunk)))
        translation_chunk = translate_client.translate_text(
            Text=transcript_chunk,
            SourceLanguageCode=source_lang,
            TargetLanguageCode=target_lang)
        print("Translation output text length: " + str(len(translation_chunk)))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to get response from translate: {e}".format(
                e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    translated_text = translated_text + ' ' + translation_chunk[
        "TranslatedText"]
    # Put the final result into a dict because the MIE dataplane requires metadata to be a dict.
    translation_result = {}
    translation_result["TranslatedText"] = translated_text
    translation_result["SourceLanguageCode"] = source_lang
    translation_result["TargetLanguageCode"] = target_lang
    print("Final translation text length: " + str(len(translated_text)))
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id,
                                                     operator_object.name,
                                                     workflow_id,
                                                     translation_result)
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            TranslateError="Unable to upload metadata for asset: {asset}".
            format(asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if metadata_upload['Status'] == 'Success':
            operator_object.add_media_object('Text', metadata_upload['Bucket'],
                                             metadata_upload['Key'])
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                TranslateError="Unable to upload metadata for asset: {asset}".
                format(asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())
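This operator and Example #9 both assume S3 and Translate clients plus the nltk import at module scope; a minimal sketch of that assumed setup:

# Assumed module-level setup for the Translate operators (illustrative; MIE helper imports
# as sketched after Example #1).
import json

import boto3
import nltk

s3 = boto3.client('s3')
translate_client = boto3.client('translate')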
def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        job_id = operator_object.metadata["comprehend_phrases_job_id"]
        asset_id = operator_object.asset_id
        workflow_id = operator_object.workflow_execution_id
        # If Comprehend wasn't run due to empty text input, then we're done
        if job_id == "Empty input --> empty output.":
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="No valid job id")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = comprehend.list_key_phrases_detection_jobs(Filter={
            'JobName':
            job_id,
        }, )
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Unable to get response from comprehend: {e}".
            format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        print(response)
        comprehend_status = response["KeyPhrasesDetectionJobPropertiesList"][
            0]["JobStatus"]
        if comprehend_status == "SUBMITTED" or comprehend_status == "IN_PROGRESS":
            operator_object.add_workflow_metadata(
                comprehend_phrases_job_id=job_id)
            operator_object.update_workflow_status("Executing")
            return operator_object.return_output_object()
        elif comprehend_status == "COMPLETED":
            output_uri = response["KeyPhrasesDetectionJobPropertiesList"][0][
                "OutputDataConfig"]["S3Uri"]
            delimiter = '/'
            bucket = delimiter.join(output_uri.split(delimiter)[2:3])
            file_name = output_uri.split(delimiter)[-1]
            key = delimiter.join(
                output_uri.split(delimiter)[3:-1]) + '/' + file_name
            comprehend_tarball = read_from_s3(bucket, key)
            comprehend_data = {
                "LanguageCode":
                response['KeyPhrasesDetectionJobPropertiesList'][0]
                ['LanguageCode'],
                "Results": []
            }
            if comprehend_tarball["Status"] == "Success":
                input_bytes = comprehend_tarball["Object"]
                with tarfile.open(fileobj=BytesIO(input_bytes)) as tf:
                    for member in tf:
                        if member.isfile():
                            comprehend_data["Results"].append(
                                tf.extractfile(member).read().decode('utf-8'))
                dataplane = DataPlane()
                metadata_upload = dataplane.store_asset_metadata(
                    asset_id, "key_phrases", workflow_id, comprehend_data)
                if "Status" not in metadata_upload:
                    operator_object.update_workflow_status("Error")
                    operator_object.add_workflow_metadata(
                        comprehend_error="Unable to store key phrases data {e}"
                        .format(e=metadata_upload))
                    raise MasExecutionError(
                        operator_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        operator_object.add_workflow_metadata(
                            comprehend_phrases_job_id=job_id,
                            output_uri=output_uri)
                        operator_object.update_workflow_status("Complete")
                        return operator_object.return_output_object()
                    else:
                        operator_object.update_workflow_status("Error")
                        operator_object.add_workflow_metadata(
                            comprehend_error=
                            "Unable to store key phrases data {e}".format(
                                e=metadata_upload))
                        raise MasExecutionError(
                            operator_object.return_output_object())
            else:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    comprehend_phrases_job_id=job_id,
                    comprehend_error="Could not retrieve output from S3: {e}".
                    format(e=comprehend_tarball["Message"]))
                raise MasExecutionError(operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                comprehend_phrases_job_id=job_id,
                comprehend_error="comprehend returned as failed: {e}".format(
                    e=response["KeyPhrasesDetectionJobPropertiesList"][0]
                    ["Message"]))
            raise MasExecutionError(operator_object.return_output_object())
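This checker assumes a module-level Comprehend client and a small read_from_s3 helper that wraps the raw object bytes in a status dict. A hedged sketch of that setup, inferred only from how the handler uses those names:

# Assumed module-level setup for the Comprehend key-phrases check handler (illustrative).
import tarfile
from io import BytesIO

import boto3

comprehend = boto3.client('comprehend')
s3_client = boto3.client('s3')  # hypothetical helper client name

def read_from_s3(bucket, key):
    # Hypothetical helper matching how the handler uses it: returns the object bytes
    # wrapped in a status dict so the caller can branch on "Status".
    try:
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        return {"Status": "Success", "Object": obj["Body"].read()}
    except Exception as e:
        return {"Status": "Error", "Message": str(e)}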
def lambda_handler(event, context):
    operator_object = MediaInsightsOperationHelper(event)
    # Get operator parameters
    try:
        workflow_id = str(event["WorkflowExecutionId"])
        asset_id = event['AssetId']
        if "Video" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
            key = operator_object.input["Media"]["Video"]["S3Key"]
            file_type = key.split('.')[-1]
        elif "Audio" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Audio"]["S3Bucket"]
            key = operator_object.input["Media"]["Audio"]["S3Key"]
            file_type = key.split('.')[-1]
        elif "Image" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Image"]["S3Bucket"]
            key = operator_object.input["Media"]["Image"]["S3Key"]
            file_type = key.split('.')[-1]
        elif "Text" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
            key = operator_object.input["Media"]["Text"]["S3Key"]
            file_type = key.split('.')[-1]
    except Exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            GenericDataLookupError="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())

    # Get the metadata filename
    print("Looking up metadata for s3://" + bucket + "/" + key)
    # Get user-defined location for generic data file
    if "Key" in operator_object.configuration:
        metadata_filename = operator_object.configuration["Key"]
    else:
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Missing S3 key for data file.")
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    if "Bucket" in operator_object.configuration:
        metadata_bucket = operator_object.configuration["Bucket"]
    else:
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Missing S3 bucket for data file.")
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())

    # Get metadata
    s3 = boto3.client('s3')
    try:
        print("Getting data from s3://" + metadata_bucket + "/" +
              metadata_filename)
        data = s3.get_object(Bucket=metadata_bucket, Key=metadata_filename)
        metadata_json = json.loads(data['Body'].read().decode('utf-8'))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Unable read datafile. " + str(e))
        raise MasExecutionError(operator_object.return_output_object())

    # Verify that the metadata is a dict, as required by the dataplane
    if (type(metadata_json) != dict):
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Metadata must be of type dict. Found " +
            str(type(metadata_json)) + " instead.")
        raise MasExecutionError(operator_object.return_output_object())

    # Save metadata to dataplane
    operator_object.add_workflow_metadata(AssetId=asset_id,
                                          WorkflowExecutionId=workflow_id)
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id,
                                                     operator_object.name,
                                                     workflow_id,
                                                     metadata_json)

    # Validate that the metadata was saved to the dataplane
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            GenericDataLookupError=
            "Unable to upload metadata for asset: {asset}".format(
                asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        # Update the workflow status
        if metadata_upload["Status"] == "Success":
            print(
                "Uploaded metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                GenericDataLookupError=
                "Unable to upload metadata for asset: {asset}".format(
                    asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())
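The data file location comes from the operator configuration rather than from the media input. A hypothetical configuration fragment (bucket and key invented) showing the two keys the handler expects, pointing at a JSON file whose top level must be a dict:

# Hypothetical operator configuration for the generic data lookup (bucket and key are made up).
example_configuration = {
    "Bucket": "my-metadata-bucket",        # S3 bucket holding the generic data file
    "Key": "metadata/my-asset-data.json"   # the file must contain a JSON object (a dict)
}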
def lambda_handler(event, context):
    output_object = MediaInsightsOperationHelper(event)  # operator helper for reporting status back to the workflow
    try:
        status = event["Status"]
        asset_id = event['MetaData']['AssetId']
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            PersonTrackingError="Missing key {e}".format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Images will have already been processed, so return if job status is already set.
    if status == "Complete":
        # TODO: Persist rekognition output
        output_object.update_workflow_status("Complete")
        return output_object.return_output_object()

    try:
        job_id = event["MetaData"]["PersonTrackingJobId"]
        workflow_id = event["MetaData"]["WorkflowExecutionId"]
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            PersonTrackingError="Missing a required metadata key {e}".format(
                e=e))
        raise MasExecutionError(output_object.return_output_object())

    # Check rekognition job status:
    dataplane = DataPlane()
    max_results = 1000
    pagination_token = ''
    finished = False
    # Pagination starts at the 1001st result. This while loop reads one page per iteration.
    while not finished:
        response = rek.get_person_tracking(JobId=job_id,
                                           MaxResults=max_results,
                                           NextToken=pagination_token)

        if response['JobStatus'] == "IN_PROGRESS":
            finished = True
            output_object.update_workflow_status("Executing")
            output_object.add_workflow_metadata(
                PersonTrackingJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return output_object.return_output_object()
        elif response['JobStatus'] == "FAILED":
            finished = True
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(PersonTrackingJobId=job_id,
                                                PersonTrackingError=str(
                                                    response["StatusMessage"]))
            raise MasExecutionError(output_object.return_output_object())
        elif response['JobStatus'] == "SUCCEEDED":
            if 'NextToken' in response:
                pagination_token = response['NextToken']
                # Persist rekognition results (current page)
                metadata_upload = dataplane.store_asset_metadata(
                    asset_id, operator_name, workflow_id, response)
                if "Status" not in metadata_upload:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        PersonTrackingError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id))
                    raise MasExecutionError(
                        output_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        print("Uploaded metadata for asset: {asset}".format(
                            asset=asset_id))
                    elif metadata_upload["Status"] == "Failed":
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            PersonTrackingError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        raise MasExecutionError(
                            output_object.return_output_object())
                    else:
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            PersonTrackingError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        raise MasExecutionError(
                            output_object.return_output_object())
            else:
                finished = True
                # Persist rekognition results
                metadata_upload = dataplane.store_asset_metadata(
                    asset_id, operator_name, workflow_id, response)
                if "Status" not in metadata_upload:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        PersonTrackingError=
                        "Unable to upload metadata for asset: {asset}".format(
                            asset=asset_id))
                    raise MasExecutionError(
                        output_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        print("Uploaded metadata for asset: {asset}".format(
                            asset=asset_id))
                        output_object.update_workflow_status("Complete")
                        return output_object.return_output_object()
                    elif metadata_upload["Status"] == "Failed":
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            PersonTrackingError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        raise MasExecutionError(
                            output_object.return_output_object())
                    else:
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            PersonTrackingError=
                            "Unable to upload metadata for asset: {asset}".
                            format(asset=asset_id))
                        output_object.add_workflow_metadata(
                            PersonTrackingJobId=job_id)
                        raise MasExecutionError(
                            output_object.return_output_object())
        else:
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(
                PersonTrackingError="Unable to determine status")
            raise MasExecutionError(output_object.return_output_object())
Example #8
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    bucket = ''
    key = ''
    try:
        if "Video" in event["Input"]["Media"]:
            bucket = event["Input"]["Media"]["Video"]["S3Bucket"]
            key = event["Input"]["Media"]["Video"]["S3Key"]
        elif "Image" in event["Input"]["Media"]:
            bucket = event["Input"]["Media"]["Image"]["S3Bucket"]
            key = event["Input"]["Media"]["Image"]["S3Key"]
        workflow_id = str(operator_object.workflow_execution_id)
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediaconvertError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    # Wrap in a try/except for now since an asset id is not guaranteed to be present; this should be removed later.
    try:
        asset_id = operator_object.asset_id
    except KeyError as e:
        print("No asset id passed in with this workflow", e)
        asset_id = ''

    # Get metadata
    s3_cli = boto3.client("s3", region_name=region, config=Config(signature_version='s3v4', s3={'addressing_style': 'virtual'}))
    metadata_json = {}
    try:
        # The number of seconds that the Signed URL is valid:
        signed_url_expiration = 300
        # Generate a signed URL for reading a file from S3 via HTTPS
        signed_url = s3_cli.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': key}, ExpiresIn=signed_url_expiration)
        # Launch MediaInfo
        media_info = MediaInfo.parse(signed_url)
        # Save the result
        metadata_json = json.loads(media_info.to_json())
        # If there's no Video, Audio, Image, or Text data then delete the file.
        track_types = [track['track_type'] for track in metadata_json['tracks']]
        if ('Video' not in track_types and
                'Audio' not in track_types and
                'Image' not in track_types and
                'Text' not in track_types):
            print("ERROR: File does not contain valid video, audio, image, or text content")
            print("Deleting file s3://" + bucket + "/" + key)
            s3_cli.delete_object(Bucket=bucket, Key=key)
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(MediainfoError="File does not contain valid video, audio, image, or text content")
            raise MasExecutionError(operator_object.return_output_object())
    except RuntimeError as e:
        # If MediaInfo could not run then we assume it is not a valid
        # media file and delete it
        print("Exception:\n", e)
        print("ERROR: File does not contain valid video, audio, image, or text content")
        print("Deleting file s3://" + bucket + "/" + key)
        s3_cli.delete_object(Bucket=bucket, Key=key)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediainfoError="File does not contain valid video, audio, image, or text content")
        raise MasExecutionError(operator_object.return_output_object())
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediainfoError="Unable to get Mediainfo results. " + str(e))
        raise MasExecutionError(operator_object.return_output_object())

    # Verify that the metadata is a dict, as required by the dataplane
    if type(metadata_json) != dict:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            MediainfoError="Metadata must be of type dict. Found " + str(type(metadata_json)) + " instead.")
        raise MasExecutionError(operator_object.return_output_object())

    # Pass metadata to downstream operators
    # Number of audio tracks is used by the Transcribe operator
    num_audio_tracks = len(list(filter(lambda i: i['track_type'] == 'Audio', metadata_json['tracks'])))
    operator_object.add_workflow_metadata(Mediainfo_num_audio_tracks=str(num_audio_tracks))

    # Save metadata to dataplane
    operator_object.add_workflow_metadata(AssetId=asset_id, WorkflowExecutionId=workflow_id)
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id, metadata_json)

    # Validate that the metadata was saved to the dataplane
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            MediainfoError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        # Update the workflow status
        if metadata_upload["Status"] == "Success":
            print("Uploaded metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                MediainfoError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())
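This operator assumes the pymediainfo library is available in the Lambda environment along with a few module-level names; a minimal sketch of that assumed setup:

# Assumed module-level setup for the MediaInfo operator (illustrative).
import os
import json

import boto3
from botocore.config import Config
from pymediainfo import MediaInfo

region = os.environ['AWS_REGION']  # region for the SigV4-signed S3 client; source of this value is an assumption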
Example #9
def lambda_handler(event, context):
        print("We got the following event:\n", event)

        operator_object = MediaInsightsOperationHelper(event)

        try:
            bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
            key = operator_object.input["Media"]["Text"]["S3Key"]
        except KeyError as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="No valid inputs {e}".format(e=e))
            raise MasExecutionError(operator_object.return_output_object())

        try:
            workflow_id = operator_object.workflow_execution_id
        except KeyError as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Missing a required metadata key {e}".format(e=e))
            raise MasExecutionError(operator_object.return_output_object())

        try:
            asset_id = operator_object.asset_id
        except KeyError:
            print('No asset id for this workflow')
            asset_id = ''

        try:
            source_lang = operator_object.configuration["SourceLanguageCode"]
            target_lang = operator_object.configuration["TargetLanguageCode"]
        except KeyError:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Language codes are not defined")
            raise MasExecutionError(operator_object.return_output_object())

        try:
            s3_response = s3.get_object(Bucket=bucket, Key=key)
            transcribe_metadata = json.loads(s3_response["Body"].read().decode("utf-8"))
            transcript = transcribe_metadata["results"]["transcripts"][0]["transcript"]
        except Exception as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Unable to read transcription from S3: {e}".format(e=str(e)))
            raise MasExecutionError(operator_object.return_output_object())

        try:
            translation = translate_client.translate_text(
                Text=transcript,
                SourceLanguageCode=source_lang,
                TargetLanguageCode=target_lang
            )
        except Exception as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Unable to get response from translate: {e}".format(e=str(e)))
            raise MasExecutionError(operator_object.return_output_object())
        else:

            dataplane = DataPlane()
            metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id,
                                                             translation)
            if "Status" not in metadata_upload:
                operator_object.add_workflow_metadata(
                    TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                operator_object.update_workflow_status("Error")
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if metadata_upload['Status'] == 'Success':
                    operator_object.add_media_object('Text', metadata_upload['Bucket'], metadata_upload['Key'])
                    operator_object.update_workflow_status("Complete")
                    return operator_object.return_output_object()
                else:
                    operator_object.add_workflow_metadata(
                        TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                    operator_object.update_workflow_status("Error")
                    raise MasExecutionError(operator_object.return_output_object())