import json
import logging
import os
import xml.etree.ElementTree as ET
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import boto3
from dateutil import parser

# find_duplicate_person, id_generator, and InvalidInputError are helpers
# defined elsewhere in the source project.

logger = logging.getLogger()
logger.setLevel(logging.INFO)

client = boto3.client('comprehend')


def clean_up_entity_results(entities_as_list):
    if 'PERSON' in entities_as_list:
        try:
            people = entities_as_list['PERSON']
            duplicates = find_duplicate_person(people)
            for d in duplicates:
                people.remove(d)
            entities_as_list['PERSON'] = people
        except Exception as e:
            logger.error(e)
    # Fold COMMERCIAL_ITEM and TITLE entities into a single combined list
    if 'COMMERCIAL_ITEM' in entities_as_list:
        entities_as_list['Products_and_Titles'] = entities_as_list['COMMERCIAL_ITEM']
        del entities_as_list['COMMERCIAL_ITEM']
    if 'TITLE' in entities_as_list:
        if 'Products_and_Titles' in entities_as_list:
            entities_as_list['Products_and_Titles'].extend(entities_as_list['TITLE'])
        else:
            entities_as_list['Products_and_Titles'] = entities_as_list['TITLE']
        del entities_as_list['TITLE']
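
# A hypothetical usage sketch (not from the source project), assuming
# find_duplicate_person returns the near-duplicate names to drop, e.g.
# 'Jeff' when 'Jeff Barr' is also present:
#
#   entities = {
#       'PERSON': ['Jeff Barr', 'Jeff'],
#       'COMMERCIAL_ITEM': ['Echo Dot'],
#       'TITLE': ['re:Invent Keynote'],
#   }
#   clean_up_entity_results(entities)
#   # entities == {'PERSON': ['Jeff Barr'],
#   #              'Products_and_Titles': ['Echo Dot', 're:Invent Keynote']}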

# Example #2

def lambda_handler(event, context):
    logger.info("Received event: " + json.dumps(event, indent=2))
    feed_url = event['rss']
    max_episodes_to_process = None
    if 'maxEpisodesToProcess' in event:
        max_episodes_to_process = int(event['maxEpisodesToProcess'])

    # Written into the S3 episode document so downstream processing can cap
    # how many episodes run concurrently
    maxConcurrentEpisodes = 10

    # Open the url and process the RSS feed
    retval = []
    bucket = os.environ['BUCKET_NAME']

    episode_count = 0

    # This array holds the entity types that are included in the custom vocabulary
    vocabularyTypes = [
        'COMMERCIAL_ITEM', 'EVENT', 'LOCATION', 'ORGANIZATION', 'TITLE'
    ]
    vocabularyItems = []

    try:
        filename = '/tmp/' + id_generator() + '.rss'
        # HTTP GET the RSS feed XML file and write it to a local temp file
        with urlopen(feed_url) as f, open(filename, "wb") as local_file:
            local_file.write(f.read())

        # The RSS feed is an XML file, so parse it and walk the tree to pull
        # out each /channel/item
        tree = ET.parse(filename)
        root = tree.getroot()

        # Extract the title of the podcast
        channelTitle = root.find('channel/title')

        for child in root.findall('channel/item'):
            title = child.find('title')
            envelope = child.find('enclosure')

            date_entry = child.find('pubDate').text
            dt = parser.parse(date_entry)
            date_string = dt.strftime("%Y:%m:%d %H:%M:%S")

            keywords = []

            description = child.find('description').text
            # Comprehend limits the size of the text it will analyze in a
            # single call, so truncate long descriptions with a safety margin
            description = description[0:4900]

            comprehendResponse = client.detect_entities(Text=description,
                                                        LanguageCode='en')
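            # Each entry in comprehendResponse["Entities"] is a dict shaped
            # roughly like:
            #   {'Type': 'PERSON', 'Text': 'Jeff Barr', 'Score': 0.99,
            #    'BeginOffset': 10, 'EndOffset': 19}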

            # we estimate the number of speakers in the podcast by parsing people names from the episode summary
            speaker_list = []
            for entity in comprehendResponse["Entities"]:

                # For every person mentioned in the description, increment the
                # number of speakers. This assumes the episode text mentions
                # all the speakers and doesn't mention people who are not in
                # the podcast.
                # It isn't critical that this number is exact; it is only used
                # to break the body of the podcast into smaller chunks, so an
                # inaccurate speaker count has no major impact on the system.
                if entity['Type'] == 'PERSON':
                    if not entity['Text'].startswith('@'):
                        speaker_list.append(entity['Text'])
                    else:
                        logger.info(f'skipping person {entity["Text"]}')
                # Add to the custom vocabulary if it's not already in there
                if entity['Type'] in vocabularyTypes and entity['Text'] not in vocabularyItems:
                    # Strip @ and . characters, which we don't want in vocabulary entries
                    cleanText = entity['Text'].replace('@', '').replace('.', '')
                    if cleanText:
                        vocabularyItems.append(cleanText)

            # Drop near-duplicate names so each speaker is counted only once
            duplicates = find_duplicate_person(speaker_list)
            for d in duplicates:
                speaker_list.remove(d)
            num_speakers = len(speaker_list)

            # If there is an envelope, the link will point to an audio file
            if envelope is not None:
                episode_url = envelope.attrib['url']
                file_type = envelope.attrib["type"]
                episode_count += 1

                episode = {
                    'Episode': title.text,
                    'PodcastName': channelTitle.text,
                    'podcastUrl': episode_url,
                    'audioType': file_type,
                    'tags': keywords,
                    'speakers': num_speakers,
                    'speakerNames': speaker_list,
                    'status': 'PENDING',
                    'publishedTime': date_string,
                    'summary': description,
                    'sourceFeed': feed_url
                }

                logger.debug(json.dumps(episode, indent=2))

                if "dryrun" in event:
                    episode["dryrun"] = event["dryrun"]
                # Add this item to the collection
                retval.append(episode)

            if max_episodes_to_process is not None and episode_count >= max_episodes_to_process:
                break

    # handle errors
    except HTTPError as e:
        logger.error("HTTP Error: %s %s", e.code, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)
    except URLError as e:
        logger.error("URL Error: %s %s", e.reason, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)

    logger.info(json.dumps(retval, indent=2))

    # This collection can be pretty big and exceed the size limit for Step
    # Functions state data, so we store it in S3 instead and return a link
    # to the S3 file.
    s3_client = boto3.client('s3')
    key = 'podcasts/episodelist/' + id_generator() + '.json'
    s3_client.put_object(
        Body=json.dumps(
            {
                "maxConcurrentEpisodes": maxConcurrentEpisodes,
                "episodes": retval
            },
            indent=2),
        Bucket=bucket,
        Key=key)

    event['episodes'] = {
        "status": 'RUNNING',
        "remainingEpisodes": episode_count,
        "bucket": bucket,
        "key": key
    }
    event['customVocabulary'] = vocabularyItems

    # Return the link to the episode JSON document and the custom vocabulary items.
    return event
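
# A minimal local smoke-test sketch; the bucket name and feed URL below are
# placeholder assumptions, not values from the project:
#
#   os.environ['BUCKET_NAME'] = 'my-example-bucket'
#   result = lambda_handler(
#       {'rss': 'https://example.com/podcast.rss',
#        'maxEpisodesToProcess': '2',
#        'dryrun': True},
#       None)
#   print(result['episodes']['key'], result['customVocabulary'])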