def lambda_handler(event, context):
    """Start an Amazon Transcribe job for a downloaded podcast episode.

    Expects ``event`` to carry the audio location (``audioS3Location``),
    the audio content type, the expected speaker count, and the custom
    vocabulary name produced by earlier steps.

    Returns:
        dict with ``success`` ("TRUE"/"FALSE") and ``transcribeJob``
        (the generated job name) for the step function to poll.

    Raises:
        InvalidInputError: if the audio content type is not supported.
        ThrottlingException: on any Transcribe client error so the step
            function retries (concurrent-job limits are the common cause).
    """
    session = boto3.session.Session()
    region = session.region_name

    # Default to unsuccessful until start_transcription_job succeeds.
    isSuccessful = "FALSE"

    # Create a random name for the transcription job.
    jobname = id_generator()

    # Extract the bucket and key produced by the downloadPodcast lambda.
    bucket = event['audioS3Location']['bucket']
    key = event['audioS3Location']['key']

    content_type = event['audio_type']
    if content_type not in CONTENT_TYPE_TO_MEDIA_FORMAT:
        raise InvalidInputError(content_type + " is not supported audio type.")
    media_type = CONTENT_TYPE_TO_MEDIA_FORMAT[content_type]
    logger.info("media type: " + content_type)

    # Assemble the url for the object for transcribe. It must be an s3 url
    # in the same region as the job.
    url = "https://s3-" + region + ".amazonaws.com/" + bucket + "/" + key

    try:
        settings = {
            'VocabularyName': event['vocabularyInfo']['name'],
            'ShowSpeakerLabels': False
        }
        # Enable speaker separation only for multi-speaker episodes;
        # MaxSpeakerLabels is required whenever ShowSpeakerLabels is on.
        # (The original computed an unused showSpeakerLabels flag and
        # evaluated the speaker count twice.)
        if int(event['speakers']) > 1:
            settings['ShowSpeakerLabels'] = True
            settings['MaxSpeakerLabels'] = int(event['speakers'])

        # Call the AWS SDK to initiate the transcription job.
        response = client.start_transcription_job(
            TranscriptionJobName=jobname,
            LanguageCode='en-US',
            Settings=settings,
            MediaFormat=media_type,
            Media={'MediaFileUri': url})
        isSuccessful = "TRUE"
    except (client.exceptions.BadRequestException,
            client.exceptions.LimitExceededException,
            client.exceptions.ClientError) as e:
        # There is a limit to how many transcribe jobs can run concurrently.
        # If you hit this limit, surface it as a throttle and the step
        # function will retry. (The three original branches were identical.)
        logger.error(str(e))
        raise ThrottlingException(e)

    return {"success": isSuccessful, "transcribeJob": jobname}
def lambda_handler(event, context):
    """Kick off an Amazon Transcribe job with channel identification.

    Reads the audio's bucket/key and content type from ``event``, starts a
    Transcribe job over the S3 object, and returns the job name plus a
    success flag for the step function to poll.

    Raises:
        InvalidInputError: when the audio content type is unsupported.
        ThrottlingException: on any Transcribe client error so the caller
            retries (concurrent-job limits are the usual trigger).
    """
    print("Received event" + json.dumps(event, indent=4))

    aws_session = boto3.session.Session()
    region = aws_session.region_name

    # Assume failure until the job is accepted.
    success_flag = "FALSE"

    # Random, unique transcription job name.
    job_name = id_generator()

    # Audio location provided by the previous step.
    bucket = event['bucket']
    key = event['key']

    content_type = event['audio_type']
    if content_type not in CONTENT_TYPE_TO_MEDIA_FORMAT:
        raise InvalidInputError(content_type + " is not supported audio type.")
    media_type = CONTENT_TYPE_TO_MEDIA_FORMAT[content_type]
    logger.info("media type: " + content_type)

    # Transcribe requires a regional S3 URL for the media object.
    url = "https://s3-" + region + ".amazonaws.com/" + bucket + "/" + key

    try:
        # Separate the transcript per audio channel (agent vs. customer).
        settings = {'ChannelIdentification': True}
        print('url: ' + url)

        # Hand the job off to Amazon Transcribe.
        response = client.start_transcription_job(
            TranscriptionJobName=job_name,
            LanguageCode=LANGUAGE_CODE,
            Settings=settings,
            MediaFormat=media_type,
            Media={'MediaFileUri': url})
        success_flag = "TRUE"
    except client.exceptions.BadRequestException as e:
        # There is a limit to how many transcribe jobs can run concurrently.
        # If you hit this limit, return unsuccessful and the step function
        # will retry.
        logger.error(str(e))
        raise ThrottlingException(e)
    except client.exceptions.LimitExceededException as e:
        # Same concurrent-job limit handling as above.
        logger.error(str(e))
        raise ThrottlingException(e)
    except client.exceptions.ClientError as e:
        # Catch-all for remaining client errors: log and let the step
        # function retry.
        logger.error(str(e))
        raise ThrottlingException(e)

    return {"success": success_flag, "transcribeJob": job_name}
def lambda_handler(event, context):
    """Download a podcast episode from its URL and stream it into S3.

    Reads ``podcastUrl``, ``bucket`` and ``audio_type`` from ``event``
    and uploads the audio under a unique ``podcasts/audio/`` key.

    Returns:
        dict with the ``bucket`` and ``key`` of the stored audio file.

    Raises:
        HTTPError / URLError: download failures (logged, then re-raised).
        Exception: anything unexpected (logged, then re-raised).
    """
    url = event['podcastUrl']
    bucket = event['bucket']
    content_type = event['audio_type']

    # Generate a unique key so repeated downloads never collide in S3.
    key = 'podcasts/audio/' + id_generator() + "-" + os.path.basename(url)

    try:
        logger.info("downloading from: " + url)
        # Stream the download straight into S3. The context manager
        # guarantees the HTTP connection is closed (the original leaked
        # the urlopen handle).
        with urlopen(url) as stream:
            s3_object_metadata = {'href': url}
            logger.info("writing to s3://" + bucket + "/" + key)
            s3_client.upload_fileobj(
                Fileobj=stream,
                Bucket=bucket,
                Key=key,
                ExtraArgs={
                    "Metadata": s3_object_metadata,
                    'ContentType': content_type
                })
        logger.info("done writing to s3://" + bucket + "/" + key)

        # Return the bucket and key, the location of the podcast file
        # stored in S3.
        return {"bucket": bucket, "key": key}

    # handle errors — bare `raise` preserves the original traceback
    # (the original `raise e` re-anchored it here).
    except HTTPError as e:
        logger.error("HTTPError downloading:" + url)
        logger.exception(str(e))
        raise
    except URLError as e:
        logger.error("URLError downloading:" + url)
        logger.exception(str(e))
        raise
    except Exception as e:
        logger.error("Unexpected error:")
        logger.exception(str(e))
        raise
def lambda_handler(event, context):
    """
    The first lambda function that runs, triggered by a DynamoDB Transcripts
    table event.

    Starts the state machine and gives it the key for the audio file stored
    in S3 for audio transcription.

    Does not return any value for another lambda function.
    """
    # Default to an empty list: a malformed event without 'Records' must
    # not raise TypeError (the original iterated event.get('Records'),
    # which is None in that case).
    for record in event.get('Records', []):
        if record.get('eventName') in ('INSERT', 'MODIFY'):
            # Retrieve the item attributes from the stream record.
            new_image = record['dynamodb']['NewImage']
            file_data = new_image['fileData']['M']
            request_params = {
                "dynamoId": new_image['id']['S'],
                "bucketName": file_data['bucketName']['S'],
                "bucketKey": file_data['bucketKey']['S'],
                "jurisdiction": new_image['jurisdiction']['S'],
                "description": new_image['description']['S'],
                "procedure": new_image['procedure']['S'],
                "fileType": new_image['fileType']['S'],
                "fileName": new_image['fileName']['S']
            }
            # Kick off the transcription state machine for this item.
            response = STEPFUNCTIONS_CLIENT.start_execution(
                stateMachineArn=STEPFUNCTIONS_ARN,
                name=id_generator(),
                input=json.dumps(request_params, indent=4, sort_keys=True,
                                 default=str))
        else:
            print("Should only expect insert/modify DynamoDB operations")
def _build_sentences(items, base_fields):
    """Group Transcribe word items into sentences.

    Words that follow the previous word by less than one second belong to
    the same sentence; a gap of one second or more starts a new sentence.
    Items without timing info (punctuation) are skipped.  Timestamps are
    converted to integer milliseconds before returning.
    """
    sentences = []
    for d in items:
        if 'start_time' not in d:
            # Punctuation items carry no timing information; skip them.
            continue
        if not sentences or (float(d['start_time']) -
                             float(sentences[-1]['end_time'])) >= 1:
            # Gap of one second or more: start a new sentence.
            sentence = {
                'start_time': d['start_time'],
                'end_time': d['end_time'],
                'content': d['alternatives'][0]['content'],
                'detail_flag': True
            }
            sentence.update(base_fields)
            sentences.append(sentence)
        else:
            # Less than one second: extend the current sentence.
            # (Bug fix: the original agent path compared d['end_time']
            # here instead of d['start_time'], silently dropping words
            # whose start-gap was < 1s but end-gap was >= 1s.)
            sentences[-1]['end_time'] = d['end_time']
            sentences[-1]['content'] += d['alternatives'][0]['content']
    # Convert timestamps from seconds (string) to integer milliseconds.
    for sentence in sentences:
        sentence['start_time'] = int(float(sentence['start_time']) * 1000)
        sentence['end_time'] = int(float(sentence['end_time']) * 1000)
    return sentences


def _upload_json(document, prefix):
    """Serialize *document* to S3 under *prefix* and return its location."""
    key = prefix + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(document, indent=2),
                                    Bucket=BUCKET,
                                    Key=key)
    logger.info(json.dumps(response, indent=2))
    logger.info("successfully written transcript to s3://" + BUCKET + "/" +
                key)
    return {"bucket": BUCKET, "key": key}


def process_transcript(transcription_url, agent_name='', agent_arn=''):
    """Analyze a two-channel call transcript and store the results in S3.

    Downloads the Transcribe job output, splits each channel (0=customer,
    1=agent) into sentences, runs detect_all() on every sentence and on
    the whole-call transcripts, and uploads each resulting document to S3.

    Returns:
        list of {"bucket", "key"} locations of the uploaded documents.
    """
    response = urlopen(transcription_url)
    output = response.read()
    json_data = json.loads(output)
    logger.info(json_data)

    job_name = json_data['jobName']
    channels = json_data['results']['channel_labels']['channels']

    # Customer is channel 0.
    customer_transcriptions = _build_sentences(
        channels[0]['items'],
        {'job_name': job_name, 'person': 'customer'})
    for transcription in customer_transcriptions:
        # Annotate each sentence with the detect_all() analysis results.
        for key, value in detect_all(transcription['content']).items():
            transcription[key] = value

    # Agent is channel 1.
    agent_transcriptions = _build_sentences(
        channels[1]['items'],
        {'job_name': job_name, 'person': 'agent',
         'agent_arn': agent_arn, 'agent_name': agent_name})
    for transcription in agent_transcriptions:
        for key, value in detect_all(transcription['content']).items():
            transcription[key] = value

    # Whole-call transcript per channel. The text is Japanese, so strip
    # the spaces Transcribe inserts between tokens.
    agent_content = ''.join(
        item['alternatives'][0]['content']
        for item in channels[1]['items']).replace(' ', '')
    customer_content = ''.join(
        item['alternatives'][0]['content']
        for item in channels[0]['items']).replace(' ', '')

    whole_transcription = {
        'whole_transcript':
            json_data['results']['transcripts'][0]['transcript'].replace(
                ' ', ''),
        'agent_transcript': agent_content,
        'customer_transcript': customer_content,
        'job_name': job_name,
        'agent_arn': agent_arn,
        'agent_name': agent_name,
        'detail_flag': False,
    }
    # Run detect_all() over each whole transcript and prefix the result
    # keys (whole_/agent_/customer_) into the document.
    for prefix in ('whole', 'agent', 'customer'):
        detect_result = detect_all(whole_transcription[prefix + '_transcript'])
        for key in detect_result:
            whole_transcription[prefix + '_' + key] = detect_result[key]

    # Upload every sentence document plus the whole-call document and
    # collect their S3 locations for the next step.
    transcript_locations = []
    for customer_transcription in customer_transcriptions:
        transcript_locations.append(_upload_json(
            customer_transcription,
            'callrecords/transcript/sentence/customer/'))
    for agent_transcription in agent_transcriptions:
        transcript_locations.append(_upload_json(
            agent_transcription,
            'callrecords/transcript/sentence/agent/'))
    transcript_locations.append(_upload_json(
        whole_transcription, 'callrecords/transcript/whole/json/'))

    logger.info('return value:')
    logger.info(transcript_locations)
    return transcript_locations
def lambda_handler(event, context):
    """Convert a Transcribe result into keyword-tagged paragraphs in S3.

    Downloads the transcription payload, groups the word items into
    "paragraphs" (split on speaker change, a pause longer than
    ``paragraphGap`` seconds, or length near the 5000-char Comprehend
    limit), tags each paragraph via run_comprehend(), applies the custom
    vocabulary mapping, and stores the result as JSON in S3.

    Returns:
        dict with the ``bucket`` and ``key`` of the stored result
        (the payload is too big for step function state).
    """
    print("Received event: " + json.dumps(event, indent=2))

    # Pull the bucket name from the environment variable set in the
    # cloudformation stack.
    bucket = os.environ['BUCKET_NAME']
    retval = []

    # Pull the signed URL for the payload of the transcription job.
    transcriptionUrl = event['transcribeStatus']['transcriptionUrl']

    # Load the custom-vocabulary -> original-text mapping from S3.
    response = s3_client.get_object(
        Bucket=event["vocabularyInfo"]['mapping']['bucket'],
        Key=event["vocabularyInfo"]['mapping']['key'])
    file_content = response['Body'].read().decode('utf-8')
    mapping = json.loads(file_content)
    print("Received mapping: " + json.dumps(mapping, indent=2))

    # Open the transcription job payload. The Transcribe JSON has the
    # shape {"jobName", "accountId", "results": {"transcripts": [...],
    # "items": [{"start_time", "end_time", "alternatives":
    # [{"confidence", "content"}], "type"}], "speaker_labels": ...}}.
    f = urlopen(transcriptionUrl)
    j = json.loads(f.read())

    # Each word/punctuation mark is its own item with start/end times.
    items = j["results"]["items"]

    contents = ""
    prevEndTime = -1
    paragraphGap = 1.5          # seconds of silence that ends a paragraph
    prevStartTime = -1
    newParagraph = False
    prevSpeaker = 'spk_0'
    hasSpeakerLabels = False
    speakerMapping = []
    # (The original also kept `paragraphs` and `timedata` lists that were
    # never read; both removed.)

    # Create a mapping of the transitions from one speaker to another.
    if 'speaker_labels' in j['results']:
        hasSpeakerLabels = True
        for segment in j['results']['speaker_labels']['segments']:
            speakerMapping.append({
                "speakerLabel": segment['speaker_label'],
                "endTime": float(segment['end_time'])
            })

    speakerIndex = 0

    # The transcription is broken into "paragraphs": the unit of text
    # stored in the elasticsearch index. Paragraphs end on punctuation,
    # speaker changes, a long pause in the audio, or overall length.
    for item in items:
        reason = ""

        # A full stop marks a candidate paragraph boundary. Punctuation
        # is appended directly (no leading space) and carries no timing.
        if item['type'] == 'punctuation':
            if item["alternatives"][0]["content"] == '.':
                newParagraph = True
            # Always assume the first guess is right.
            contents += item["alternatives"][0]["content"]

        if 'start_time' in item:
            speakerLabel = 'spk_0'
            if prevStartTime == -1:
                prevStartTime = float(item["start_time"])

            # gap refers to the amount of time between spoken words.
            gap = float(item["start_time"]) - prevEndTime

            if hasSpeakerLabels:
                # Advance to the speaker segment covering this word.
                while speakerIndex < (len(speakerMapping) - 1) and \
                        speakerMapping[speakerIndex + 1]['endTime'] < \
                        float(item["start_time"]):
                    speakerIndex += 1
                speakerLabel = speakerMapping[speakerIndex]['speakerLabel']

            # Change paragraphs if the speaker changes...
            if speakerLabel != prevSpeaker:
                newParagraph = True
                reason = "Speaker Change from " + prevSpeaker + " to " + \
                    speakerLabel
            # ...or the gap exceeds a preset threshold...
            elif gap > paragraphGap:
                newParagraph = True
                reason = "Time gap"
            # ...or the text approaches Comprehend's 5000-char limit.
            elif len(contents) > 4900:
                newParagraph = True
                reason = "Long paragraph"
            else:
                newParagraph = False

            if prevEndTime != -1 and newParagraph:
                # Append the paragraph; run_comprehend() supplies the
                # keyword tags for this block of text.
                retval.append({
                    "startTime": prevStartTime,
                    "endTime": prevEndTime,
                    "text": contents,
                    "gap": gap,
                    "tags": run_comprehend(contents),
                    "reason": reason,
                    "speaker": prevSpeaker,
                    "len": len(contents)
                })
                # Reset the paragraph accumulators.
                contents = ""
                prevEndTime = -1
                prevStartTime = -1
                newParagraph = False
            else:
                prevEndTime = float(item["end_time"])

            prevSpeaker = speakerLabel

            # If the contents is not empty, prepend a space.
            if contents != "":
                contents += " "

            # Always assume the first guess is right.
            word = item["alternatives"][0]["content"]
            # Map the custom vocabulary words back to their original text.
            for key in mapping:
                val = mapping[key]
                word = word.replace(key, val)
            contents += word

    # Flush the remaining text as the final paragraph.
    retval.append({
        "startTime": prevStartTime,
        "endTime": prevEndTime,
        "text": contents,
        "tags": run_comprehend(contents),
        "speaker": prevSpeaker
    })

    # The combined payload can exceed the step function state size limit,
    # so store it in S3 and return only its location.
    key = 'podcasts/keywords/' + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(retval, indent=2),
                                    Bucket=bucket,
                                    Key=key)
    print("Return Value: " + json.dumps(retval, indent=2))

    # Return the bucket and key of the transcription / comprehend result.
    return {"bucket": bucket, "key": key}
def lambda_handler(event, context):
    """Parse a podcast RSS feed into an episode list and custom vocabulary.

    Downloads the RSS XML from ``event['rss']``, extracts each episode's
    metadata, estimates the speaker count from PERSON entities detected in
    the episode description, and collects entity text for a Transcribe
    custom vocabulary.  The (potentially large) episode list is stored in
    S3; its location is added to ``event`` and ``event`` is returned.

    Raises:
        InvalidInputError: when the RSS feed cannot be downloaded.
    """
    logger.info("Received event: " + json.dumps(event, indent=2))
    feed_url = event['rss']
    max_episodes_to_process = None
    if 'maxEpisodesToProcess' in event:
        max_episodes_to_process = int(event['maxEpisodesToProcess'])
    maxConcurrentEpisodes = 10

    # Open the url and process the RSS feed.
    retval = []
    bucket = os.environ['BUCKET_NAME']
    episode_count = 0

    # Entity types that are included in the custom vocabulary.
    vocabularyTypes = [
        'COMMERCIAL_ITEM', 'EVENT', 'LOCATION', 'ORGANIZATION', 'TITLE'
    ]
    vocabularyItems = []

    try:
        filename = '/tmp/' + id_generator() + '.rss'
        # HTTP GET the RSS feed XML file and write it locally. The
        # context manager closes the connection (the original leaked it).
        with urlopen(feed_url) as f, open(filename, "wb") as local_file:
            local_file.write(f.read())

        # The RSS feed is an XML file: parse it and pull /channel/item.
        tree = ET.parse(filename)
        root = tree.getroot()

        # Extract the title of the podcast.
        channelTitle = root.find('channel/title')

        for child in root.findall('channel/item'):
            title = child.find('title')
            envelope = child.find('enclosure')
            date_entry = child.find('pubDate').text
            dt = parser.parse(date_entry)
            date_string = dt.strftime("%Y:%m:%d %H:%M:%S")
            keywords = []

            # Truncate to stay under Comprehend's 5000-char text limit.
            description = child.find('description').text
            description = description[0:4900]
            comprehendResponse = client.detect_entities(Text=description,
                                                        LanguageCode='en')

            # Estimate the number of speakers by counting people named in
            # the episode summary. This assumes the description mentions
            # all (and only) the speakers; it isn't critical that the
            # number is exact — it only controls how the audio is chunked.
            speaker_list = []
            for entity in comprehendResponse["Entities"]:
                if entity['Type'] == 'PERSON':
                    if not entity['Text'].startswith('@'):
                        speaker_list.append(entity['Text'])
                    else:
                        logger.info(f'skipping person {entity["Text"]}')
                # Add to the vocabulary if not already in there.
                if entity['Type'] in vocabularyTypes and \
                        entity['Text'] not in vocabularyItems:
                    cleanText = entity['Text'].replace('@', '')
                    cleanText = cleanText.replace('.', '')
                    if cleanText:
                        vocabularyItems.append(cleanText)

            # Collapse likely duplicate mentions of the same person.
            duplicates = find_duplicate_person(speaker_list)
            for d in duplicates:
                speaker_list.remove(d)
            num_speakers = len(speaker_list)

            # If there is an enclosure, the link points to an audio file.
            if envelope is not None:
                episode_url = envelope.attrib['url']
                file_type = envelope.attrib["type"]
                episode_count += 1
                episode = {
                    'Episode': title.text,
                    'PodcastName': channelTitle.text,
                    'podcastUrl': episode_url,
                    'audioType': file_type,
                    'tags': keywords,
                    'speakers': num_speakers,
                    'speakerNames': speaker_list,
                    'status': 'PENDING',
                    'publishedTime': date_string,
                    'summary': description,
                    'sourceFeed': feed_url
                }
                logger.debug(json.dumps(episode, indent=2))
                if "dryrun" in event:
                    episode["dryrun"] = event["dryrun"]
                # Add this item to the collection.
                retval.append(episode)

            if max_episodes_to_process is not None and \
                    episode_count >= max_episodes_to_process:
                break

    # handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)
    except URLError as e:
        print("URL Error:", e.reason, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)

    logger.info(json.dumps(retval, indent=2))

    # This collection can be pretty big and exceed the capacity of the Step
    # Function state data, so we store it in S3 instead and return a link
    # to the S3 file.
    s3_client = boto3.client('s3')
    key = 'podcasts/episodelist/' + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(
        {
            "maxConcurrentEpisodes": maxConcurrentEpisodes,
            "episodes": retval
        }, indent=2),
        Bucket=bucket,
        Key=key)
    event['episodes'] = {
        "status": 'RUNNING',
        "remainingEpisodes": episode_count,
        "bucket": bucket,
        "key": key
    }
    event['customVocabulary'] = vocabularyItems

    # Return the link to the episode JSON document and the custom
    # vocabulary items.
    return event
break # handle errors except HTTPError, e: print("HTTP Error:", e.code, feed_url) raise InvalidInputError("Unable to download RSS feed: " + feed_url) except URLError, e: print("URL Error:", e.reason, feed_url) raise InvalidInputError("Unable to download RSS feed: " + feed_url) logger.info(json.dumps(retval, indent=2)) # This connection can be pretty big and exceed the capacity of the Step Function state data, so we store it # in S3 instead and return a link to the S3 file. s3_client = boto3.client('s3') key = 'podcasts/episodelist/' + id_generator() + '.json' response = s3_client.put_object(Body=json.dumps( { "maxConcurrentEpisodes": maxConcurrentEpisodes, "episodes": retval }, indent=2), Bucket=bucket, Key=key) event['episodes'] = { "status": 'RUNNING', "remainingEpisodes": episode_count, "bucket": bucket, "key": key }
def process_transcript(transcription_url, podcast_url, vocabulary_info):
    """Turn a finished Transcribe job into an entity-tagged document in S3.

    Optionally loads a custom-vocabulary mapping from S3 (when
    ``vocabulary_info`` carries a ``mapping``), downloads the transcription
    payload, chunks it, runs Comprehend entity detection over the chunks,
    and writes the combined transcript + entities document to S3.

    Returns:
        dict with the ``bucket`` and ``key`` of the stored document.

    Raises:
        InvalidInputError: when the custom vocab mapping file is missing.
    """
    custom_vocabs = None
    if "mapping" in vocabulary_info:
        try:
            vocab_mapping_bucket = vocabulary_info['mapping']['bucket']
            key = vocabulary_info['mapping']['key']
            obj = s3_client.get_object(Bucket=vocab_mapping_bucket, Key=key)
            custom_vocabs = json.loads(obj['Body'].read())
            logger.info("key:" + key)
            logger.info("using custom vocab mapping: \n" +
                        json.dumps(custom_vocabs, indent=2))
        except botocore.exceptions.ClientError as e:
            # A missing mapping file is a caller error; anything else is
            # unexpected and propagates as-is.
            if e.response['Error']['Code'] == "404":
                raise InvalidInputError(
                    "The S3 file for custom vocab list does not exist.")
            else:
                raise

    # Fetch the completed transcription payload.
    payload = urlopen(transcription_url).read()
    json_data = json.loads(payload)
    logger.debug(json.dumps(json_data, indent=4))
    results = json_data['results']
    # free up memory
    del json_data

    # Break the transcript into Comprehend-sized chunks and paragraphs.
    comprehend_chunks, paragraphs = chunk_up_transcript(custom_vocabs,
                                                       results)

    # Detect entities across all chunks and time the round trip.
    start = time.time()
    detected_entities_response = comprehend.batch_detect_entities(
        TextList=comprehend_chunks, LanguageCode='en')
    round_trip = time.time() - start
    logger.info('End of batch_detect_entities. Took time {:10.4f}\n'.format(
        round_trip))

    entities = parse_detected_entities_response(detected_entities_response,
                                                {})
    # Sets aren't JSON-serializable, so convert each entity set to a list.
    entities_as_list = {kind: list(found)
                        for kind, found in entities.items()}
    clean_up_entity_results(entities_as_list)
    print(json.dumps(entities_as_list, indent=4))

    doc_to_update = {
        'transcript': paragraphs,
        'transcript_entities': entities_as_list,
    }
    logger.info(json.dumps(doc_to_update, indent=4))

    # Persist the combined document and hand back its S3 location.
    key = 'podcasts/transcript/' + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(doc_to_update, indent=2),
                                    Bucket=bucket,
                                    Key=key)
    logger.info(json.dumps(response, indent=2))
    logger.info("successfully written transcript to s3://" + bucket + "/" +
                key)

    # Return the bucket and key of the transcription / comprehend result.
    return {"bucket": bucket, "key": key}
def process_transcript(transcription_url):
    """Analyze a two-channel call transcript with Amazon Comprehend.

    Downloads the Transcribe payload, runs entity/key-phrase detection on
    the chunked transcript, builds per-channel transcripts (channel 0 =
    customer, channel 1 = agent), runs entity, key-phrase and sentiment
    detection per channel, and stores the combined document in S3.

    Returns:
        dict with the ``bucket`` and ``key`` of the stored document.
    """
    custom_vocabs = None
    response = urlopen(transcription_url)
    output = response.read()
    json_data = json.loads(output)
    logger.debug(json.dumps(json_data, indent=4))
    results = json_data['results']
    # free up memory
    del json_data

    comprehend_chunks, paragraphs = chunk_up_transcript(custom_vocabs,
                                                        results)

    key_phrases = ''
    entities_as_list = {}
    if comprehend_chunks is not None and len(comprehend_chunks) > 0:
        start = time.time()
        detected_entities_response = comprehend.batch_detect_entities(
            TextList=comprehend_chunks, LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_entities. Took time {:10.4f}\n'.format(
                round_trip))
        entities = parse_detected_entities_response(
            detected_entities_response, {})
        for entity_type in entities:
            entities_as_list[entity_type] = list(entities[entity_type])
        clean_up_entity_results(entities_as_list)
        print(json.dumps(entities_as_list, indent=4))

        start = time.time()
        detected_phrase_response = comprehend.batch_detect_key_phrases(
            TextList=comprehend_chunks, LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_key_phrases. Took time {:10.4f}\n'.format(
                round_trip))
        key_phrases = parse_detected_key_phrases_response(
            detected_phrase_response)
        logger.debug(json.dumps(key_phrases, indent=4))

    def _channel_transcript(channel_index):
        # Rebuild a channel's plain-text transcript: words get a leading
        # space, punctuation attaches directly to the previous word.
        transcript = ''
        for item in results['channel_labels']['channels'][channel_index][
                'items']:
            filler = '' if item['type'] == 'punctuation' else ' '
            transcript += filler + item['alternatives'][0]['content']
        return transcript

    agentTranscript = _channel_transcript(1)      # Agent is channel 1
    customerTranscript = _channel_transcript(0)   # Customer is channel 0

    agent = [agentTranscript]
    customer = [customerTranscript]

    agent_entities_as_list = {}
    agent_key_phrases = ''
    agent_sentiment = ''
    if agent[0] != '':
        # Bug fix: `start` was only set inside the comprehend_chunks block
        # above, so this branch raised NameError (and logged a bogus
        # duration) whenever the chunk list was empty.
        start = time.time()
        detected_agent_entities_response = comprehend.batch_detect_entities(
            TextList=agent[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_entities. Took time {:10.4f}\n'.format(
                round_trip))
        agent_entities = parse_detected_entities_response(
            detected_agent_entities_response, {})
        for entity_type in agent_entities:
            agent_entities_as_list[entity_type] = list(
                agent_entities[entity_type])
        clean_up_entity_results(agent_entities_as_list)
        print(json.dumps(agent_entities_as_list, indent=4))

        start = time.time()
        detected_agent_phrase_response = comprehend.batch_detect_key_phrases(
            TextList=agent[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_key_phrases. Took time {:10.4f}\n'.format(
                round_trip))
        agent_key_phrases = parse_detected_key_phrases_response(
            detected_agent_phrase_response)
        # Bug fix: previously logged the chunk-level key_phrases here.
        logger.debug(json.dumps(agent_key_phrases, indent=4))

        # Sentiment accepts at most 5000 bytes of text.
        agent_sentiment = comprehend.detect_sentiment(
            Text=agentTranscript[0:5000],
            LanguageCode=LANGUAGE_CODE)['Sentiment']
        print('agent sentiment ' + agent_sentiment)

    customer_entities_as_list = {}
    customer_key_phrases = ''
    customer_sentiment = ''
    if customer[0] != '':
        logger.info("CUSTOMER " + json.dumps(customer))
        logger.info("CUSTOMER[0:24] " + json.dumps(customer[0:24]))
        start = time.time()
        detected_customer_entities_response = \
            comprehend.batch_detect_entities(
                TextList=customer[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_entities. Took time {:10.4f}\n'.format(
                round_trip))
        customer_entities = parse_detected_entities_response(
            detected_customer_entities_response, {})
        for entity_type in customer_entities:
            customer_entities_as_list[entity_type] = list(
                customer_entities[entity_type])
        # Bug fix: the original cleaned and printed the AGENT entity list
        # here, leaving the customer entities unsanitized.
        clean_up_entity_results(customer_entities_as_list)
        print(json.dumps(customer_entities_as_list, indent=4))

        start = time.time()
        detected_customer_phrase_response = \
            comprehend.batch_detect_key_phrases(
                TextList=customer[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_key_phrases. Took time {:10.4f}\n'.format(
                round_trip))
        customer_key_phrases = parse_detected_key_phrases_response(
            detected_customer_phrase_response)
        logger.debug(json.dumps(customer_key_phrases, indent=4))

        customer_sentiment = comprehend.detect_sentiment(
            Text=customerTranscript[0:5000],
            LanguageCode=LANGUAGE_CODE)['Sentiment']
        print('customer sentiment ' + customer_sentiment)

    # Assemble the document stored in S3 for the next step.
    doc_to_update = {
        'transcript': paragraphs,
        'agent': agentTranscript,
        'customer': customerTranscript,
        'transcript_entities': entities_as_list,
        'key_phrases': key_phrases,
        'agent_key_phrases': agent_key_phrases,
        'agent_entities': agent_entities_as_list,
        'customer_phrases': customer_key_phrases,
        'customer_entities': customer_entities_as_list,
        'agent_sentiment': agent_sentiment,
        'customer_sentiment': customer_sentiment,
    }

    key = 'callrecords/transcript/' + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(doc_to_update, indent=2),
                                    Bucket=bucket,
                                    Key=key)
    logger.info(json.dumps(response, indent=2))
    logger.info("successfully written transcript to s3://" + bucket + "/" +
                key)

    # Return the bucket and key of the transcription / comprehend result.
    transcript_location = {"bucket": bucket, "key": key}
    return transcript_location