Example #1
0
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story."""

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")

    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db, story=story, extractor_args=extractor_args)
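
A minimal usage sketch for process_download_for_extractor(), assuming a connected DatabaseHandler and an already-fetched 'content' download row; the wrapper function name is hypothetical and not part of the original code.

# Hypothetical caller (not from the original code): extract one fetched "content"
# download and let the function decide whether the parent story is ready for
# story-level post-processing.
def extract_single_download(db: DatabaseHandler, downloads_id: int) -> None:
    download = db.find_by_id(table='downloads', object_id=downloads_id)

    # Uses the default PyExtractorArguments(); pass a custom instance to tweak extractor behaviour.
    process_download_for_extractor(db=db, download=download)
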
Example #2
0
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward. So we want to keep the feed_error state even if we redownload
    # the content
    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db,
                                                      download['downloads_id'],
                                                      content)
        error = ''
    except Exception as e:
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" %
            (download['downloads_id'], e))

    if new_state == 'success':
        error = ''

    db.update_by_id('downloads', download['downloads_id'], {
        'state': new_state,
        'path': path,
        'error_message': error
    })

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #3
0
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward. So we want to keep the feed_error state even if we redownload
    # the content

    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as ex:
        raise McDBIDownloadsException("error while trying to store download %d: %s" % (download['downloads_id'], ex))

    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={'state': new_state, 'path': path, 'error_message': download['error_message']},
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
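
A short, hedged sketch of calling store_content() above; the wrapper name and its arguments are placeholders, and the download row is assumed to already exist in the database.

# Hypothetical caller (not from the original code): persist freshly fetched content
# for an existing download row and inspect the resulting state.
def save_fetched_content(db: DatabaseHandler, downloads_id: int, content: str) -> dict:
    download = db.find_by_id('downloads', downloads_id)

    # Returns the re-read row with updated 'state', 'path' and 'error_message';
    # raises McDBIDownloadsException if the storage backend fails.
    download = store_content(db=db, download=download, content=content)

    assert download['state'] in ('success', 'feed_error')
    return download
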
Example #4
0
    def store_transcript(cls, db: DatabaseHandler,
                         transcript: Transcript) -> int:
        story = db.find_by_id(table='stories', object_id=transcript.stories_id)

        feed = db.query(
            """
            SELECT *
            FROM feeds
            WHERE feeds_id = (
                SELECT feeds_id
                FROM feeds_stories_map
                WHERE stories_id = %(stories_id)s
            )
        """, {
                'stories_id': transcript.stories_id,
            }).hash()

        download = create_download_for_new_story(db=db, story=story, feed=feed)

        text = cls._download_text_from_transcript(transcript=transcript)

        # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later
        store_content(db=db, download=download, content=text)

        return download['downloads_id']
Example #5
0
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story."""

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")

    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db, story=story, extractor_args=extractor_args)
Example #6
0
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None

        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
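
A hedged example of calling this add_story() variant; the story fields mirror those used elsewhere in these examples (url, guid, media_id, publish_date, title), and all concrete values and the wrapper name are placeholders.

# Hypothetical caller (not from the original code): add a story discovered in a feed.
def add_feed_story(db: DatabaseHandler, feeds_id: int, media_id: int) -> Optional[dict]:
    new_story = {
        'url': 'http://example.com/article',   # placeholder values
        'guid': 'http://example.com/article',
        'media_id': media_id,
        'publish_date': sql_now(),
        'title': 'Example article',
    }

    story = add_story(db=db, story=new_story, feeds_id=feeds_id)
    if story is None:
        # Duplicate story (is_new() returned False) or a "stories_guid" conflict.
        log.debug("Story was not added.")

    return story
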
Example #7
0
def fetch_sitemap_pages_for_media_id(db: DatabaseHandler,
                                     media_id: int) -> None:
    """Fetch and store all pages (news stories or not) from media's sitemap tree."""
    media = db.find_by_id(table='media', object_id=media_id)
    if not media:
        raise Exception("Unable to find media with ID {}".format(media_id))

    media_url = media['url']

    log.info("Fetching sitemap pages for media ID {} ({})...".format(
        media_id, media_url))
    web_client = _SitemapWebClient()
    sitemaps = sitemap_tree_for_homepage(homepage_url=media_url,
                                         web_client=web_client)
    log.info("Fetched pages for media ID {} ({}).".format(media_id, media_url))

    log.info("Storing sitemap pages for media ID {} ({})...".format(
        media_id, media_url))

    insert_counter = 0
    for page in sitemaps.all_pages():
        db.query(
            """
            INSERT INTO media_sitemap_pages (
                media_id, url, last_modified, change_frequency, priority,
                news_title, news_publish_date
            ) VALUES (
                %(media_id)s, %(url)s, %(last_modified)s, %(change_frequency)s, %(priority)s,
                %(news_title)s, %(news_publish_date)s
            )
            ON CONFLICT (url) DO NOTHING
        """, {
                'media_id':
                media_id,
                'url':
                page.url,
                'last_modified':
                page.last_modified,
                'change_frequency':
                page.change_frequency.value
                if page.change_frequency is not None else None,
                'priority':
                page.priority,
                'news_title':
                page.news_story.title if page.news_story is not None else None,
                'news_publish_date':
                page.news_story.publish_date
                if page.news_story is not None else None,
            })

        insert_counter += 1
        if insert_counter % 1000 == 0:
            log.info("Inserted {} URLs...".format(insert_counter))

    log.info("Done storing {} sitemap pages for media ID {} ({}).".format(
        insert_counter, media_id, media_url))
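
A small sketch of running the sitemap fetcher above and previewing what it stored; the wrapper name and media_id value are placeholders, and the SELECT relies only on the media_sitemap_pages columns named in the INSERT.

# Hypothetical caller (not from the original code): fetch a media source's sitemap
# tree and peek at a few of the stored pages.
def fetch_and_preview_sitemap_pages(db: DatabaseHandler, media_id: int) -> List[dict]:
    fetch_sitemap_pages_for_media_id(db=db, media_id=media_id)

    # Columns as used by the INSERT above.
    return db.query("""
        SELECT url, news_title, news_publish_date
        FROM media_sitemap_pages
        WHERE media_id = %(media_id)s
        LIMIT 10
    """, {'media_id': media_id}).hashes()
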
Example #8
0
    def __init__(self, db: DatabaseHandler, snapshots_id: int):
        super().__init__()

        snapshots_id = int(snapshots_id)

        # Verify that the snapshot exists
        if db.find_by_id(table='snapshots', object_id=snapshots_id) is None:
            raise McWord2vecException("Snapshot with ID %d does not exist." %
                                      snapshots_id)

        self.__snapshots_id = snapshots_id
        self.__sentence_counter = 0

        # Subselect such as:
        #
        #     SELECT sentence
        #     FROM story_sentences
        #     WHERE stories_id IN (
        #         SELECT stories_id
        #         FROM snap.snapshots
        #         WHERE snapshots_id = ...
        #     )
        #
        # or its variants (e.g. INNER JOIN) makes the query planner decide on a sequential scan on "story_sentences",
        # so we create a temporary table with snapshot's "stories_id" first.
        log.info("Creating a temporary table with snapshot's stories_id...")
        snapshots_stories_id_temp_table_name = 'snapshot_stories_ids_{}'.format(
            random_string(32))
        db.query(
            """
            CREATE TEMPORARY TABLE {} AS
                SELECT stories_id
                FROM snap.stories
                WHERE snapshots_id = %(snapshots_id)s
        """.format(snapshots_stories_id_temp_table_name),
            {'snapshots_id': snapshots_id})

        # "INNER JOIN" instead of "WHERE stories_id IN (SELECT ...)" here because then database doesn't have to compute
        # distinct "stories_id" to SELECT sentence FROM story_sentences against, i.e. it doesn't have to
        # Group + HashAggregate on the temporary table.
        log.info("Creating COPY TO object...")
        self.__copy_to = db.copy_to("""
            COPY (
                SELECT story_sentences.sentence
                FROM {} AS snapshot_stories_ids
                    INNER JOIN story_sentences
                        ON snapshot_stories_ids.stories_id = story_sentences.stories_id
            ) TO STDOUT WITH CSV
        """.format(snapshots_stories_id_temp_table_name))
Example #9
0
def store_content(
    db: DatabaseHandler,
    download: dict,
    content: str,
    amazon_s3_downloads_config: AmazonS3DownloadsConfig = None,
    download_storage_config: DownloadStorageConfig = None,
) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward. So we want to keep the feed_error state even if we redownload
    # the content

    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    if not amazon_s3_downloads_config:
        amazon_s3_downloads_config = _default_amazon_s3_downloads_config()
    if not download_storage_config:
        download_storage_config = _default_download_storage_config()

    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        store = _get_store_for_writing(
            amazon_s3_downloads_config=amazon_s3_downloads_config,
            download_storage_config=download_storage_config,
        )
        path = store.store_content(db, download['downloads_id'], content)
    except Exception as ex:
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" %
            (download['downloads_id'], ex))

    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={
            'state': new_state,
            'path': path,
            'error_message': download['error_message']
        },
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #10
0
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Handle feeds of type 'web_page' by just creating a story to associate with the content.

        Web page feeds are feeds that consist of a web page that we download once a week and add as a story.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        feeds_id = download['feeds_id']

        feed = db.find_by_id(table='feeds', object_id=feeds_id)

        title = html_title(html=content, fallback='(no title)')
        title += '[' + sql_now() + ']'

        guid = f"{str(int(time.time()))}:{download['url']}"[0:1024]

        new_story = {
            'url': download['url'],
            'guid': guid,
            'media_id': feed['media_id'],
            'publish_date': sql_now(),
            'title': title,
        }

        story = add_story(db=db, story=new_story, feeds_id=feeds_id)
        if not story:
            raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

        db.query(
            """
            UPDATE downloads
            SET stories_id = %(stories_id)s,
                type = 'content'
            WHERE downloads_id = %(downloads_id)s
        """, {
                'stories_id': story['stories_id'],
                'downloads_id': download['downloads_id'],
            })

        # A webpage that was just fetched is also a story
        story_ids = [
            story['stories_id'],
        ]

        return story_ids
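
A hedged sketch of driving the web-page feed handler above; DownloadFeedWebPageHandler is the class named in Example #16, while the wrapper function and its html argument are placeholders.

# Hypothetical direct call (not from the original code); in practice the handler is
# normally obtained via handler_for_download() (see Example #16).
def store_web_page_feed(db: DatabaseHandler, download: dict, html: str) -> List[int]:
    handler = DownloadFeedWebPageHandler()

    handler.add_stories_from_feed(db=db, download=download, content=html)

    # Reads the story IDs to extract back from the (possibly updated) download row.
    return handler.return_stories_to_be_extracted_from_feed(db=db, download=download, content=html)
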
Example #11
0
    def return_stories_to_be_extracted_from_feed(self, db: DatabaseHandler,
                                                 download: dict,
                                                 content: str) -> List[int]:
        download = decode_object_from_bytes_if_needed(download)
        # content = decode_object_from_bytes_if_needed(content)

        # Download row might have been changed by add_stories_from_feed()
        download = db.find_by_id(table='downloads',
                                 object_id=download['downloads_id'])

        # Extract web page download that was just fetched
        stories_to_extract = [
            download['stories_id'],
        ]

        return stories_to_extract
Example #12
0
    def __init__(self,
                 db: DatabaseHandler,
                 snapshots_id: int,
                 stories_id_chunk_size: int = __DEFAULT_STORIES_ID_CHUNK_SIZE):
        super().__init__()

        snapshots_id = int(snapshots_id)

        self.__db = db
        self.__snapshots_id = snapshots_id
        self.__stories_id_chunk_size = stories_id_chunk_size

        self.__sentences_deque = deque()
        self.__last_encountered_stories_id = 0

        # Verify that the snapshot exists
        if db.find_by_id(table='snapshots', object_id=snapshots_id) is None:
            raise McWord2vecException("Snapshot with ID %d does not exist." % snapshots_id)
Example #13
0
    def __init__(self,
                 db: DatabaseHandler,
                 snapshots_id: int,
                 stories_id_chunk_size: int = __DEFAULT_STORIES_ID_CHUNK_SIZE):
        super().__init__()

        snapshots_id = int(snapshots_id)

        self.__db = db
        self.__snapshots_id = snapshots_id
        self.__stories_id_chunk_size = stories_id_chunk_size

        self.__sentences_deque = deque()
        self.__last_encountered_stories_id = 0

        # Verify that the snapshot exists
        if db.find_by_id(table='snapshots', object_id=snapshots_id) is None:
            raise McWord2vecException("Snapshot with ID %d does not exist." %
                                      snapshots_id)
Example #14
0
    def __init__(self, db: DatabaseHandler, snapshots_id: int):
        super().__init__()

        snapshots_id = int(snapshots_id)

        # Verify that the snapshot exists
        if db.find_by_id(table='snapshots', object_id=snapshots_id) is None:
            raise McWord2vecException("Snapshot with ID %d does not exist." % snapshots_id)

        self.__snapshots_id = snapshots_id

        self.__sentence_counter = 0

        log.info("Creating COPY TO object...")
        self.__copy_to = db.copy_to("""
            COPY (
                SELECT story_sentences.sentence
                FROM snap.stories
                    INNER JOIN story_sentences
                        ON snap.stories.stories_id = story_sentences.stories_id
                WHERE snap.stories.snapshots_id = %d
            ) TO STDOUT WITH CSV
        """ % snapshots_id)
Example #15
0
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    new_state = 'success'
    if download['state'] == 'feed_error':
        new_state = download['state']

    path = ''
    error = ''
    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as e:
        raise McDBIDownloadsException("error while trying to store download %d: %s" % (download['downloads_id'], e))

    if new_state == 'success':
        error = ''

    db.update_by_id('downloads', download['downloads_id'], {'state': new_state, 'path': path, 'error_message': error})

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #16
0
def handler_for_download(db: DatabaseHandler,
                         download: dict) -> AbstractDownloadHandler:
    """Returns correct handler for download."""

    download = decode_object_from_bytes_if_needed(download)

    downloads_id = int(download['downloads_id'])
    download_type = download['type']

    if download_type == 'feed':
        feeds_id = int(download['feeds_id'])
        feed = db.find_by_id(table='feeds', object_id=feeds_id)
        feed_type = feed['type']

        if feed_type == 'syndicated':
            handler = DownloadFeedSyndicatedHandler()
        elif feed_type == 'web_page':
            handler = DownloadFeedWebPageHandler()
        elif feed_type == 'univision':
            handler = DownloadFeedUnivisionHandler()
        elif feed_type == 'podcast':
            handler = DownloadFeedPodcastHandler()
        else:
            # Unknown feed type is a hard error as we don't expect types that we don't know about to be there
            raise McCrawlerFetcherHardError(
                f"Unknown feed type '{feed_type}' for feed {feeds_id}, download {downloads_id}"
            )
    elif download_type == 'content':
        handler = DownloadContentHandler()
    else:
        # Unknown download type is a hard error as we don't expect types that we don't know about to be there
        raise McCrawlerFetcherHardError(
            f"Unknown download type '{download_type}' for download {downloads_id}"
        )

    return handler
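
A brief usage sketch for the dispatcher above; store_download() on the returned handler is shown in Example #17, and the wrapper name, downloads_id and response_content are placeholders.

# Hypothetical caller (not from the original code): dispatch a fetched download to
# the appropriate handler and store its content.
def store_fetched_download(db: DatabaseHandler, downloads_id: int, response_content: str) -> List[int]:
    download = db.find_by_id(table='downloads', object_id=downloads_id)

    handler = handler_for_download(db=db, download=download)

    # For feed downloads this parses the feed and returns the story IDs that still
    # need extraction (see Example #17).
    return handler.store_download(db=db, download=download, content=response_content)
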
Example #17
0
    def store_download(self, db: DatabaseHandler, download: dict,
                       content: str) -> List[int]:
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        downloads_id = download['downloads_id']

        log.info(f"Processing feed download {downloads_id}...")

        try:
            added_story_ids = self.add_stories_from_feed(db=db,
                                                         download=download,
                                                         content=content)
            story_ids_to_extract = self.return_stories_to_be_extracted_from_feed(
                db=db, download=download, content=content)

        except Exception as ex:

            error_message = f"Error processing feed for download {downloads_id}: {ex}"
            log.error(error_message)

            db.query(
                """
                UPDATE downloads
                SET state = 'feed_error',
                    error_message = %(error_message)s
                WHERE downloads_id = %(downloads_id)s
            """, {
                    'error_message': error_message,
                    'downloads_id': downloads_id,
                })

            # On non-soft errors (explicitly hard errors or unknown errors), pass the exception up
            if not isinstance(ex, McCrawlerFetcherSoftError):
                raise ex

            story_ids_to_extract = []

        else:

            if len(added_story_ids):
                last_new_story_time_sql = 'last_new_story_time = last_attempted_download_time, '
            else:
                last_new_story_time_sql = ''

            db.query(
                f"""
                UPDATE feeds
                SET {last_new_story_time_sql}
                    last_successful_download_time = GREATEST(last_successful_download_time, %(download_time)s)
                WHERE feeds_id = %(feeds_id)s
            """, {
                    'download_time': download['download_time'],
                    'feeds_id': download['feeds_id'],
                })

            # If no new stories, just store "(redundant feed)" to save storage space
            if len(added_story_ids) == 0:
                content = '(redundant feed)'

        # Reread the possibly updated download
        download = db.find_by_id(table='downloads', object_id=downloads_id)

        # Store the feed in any case
        store_content(db=db, download=download, content=content)

        log.info(f"Done processing feed download {downloads_id}")

        return story_ids_to_extract
Example #18
0
def fetch_and_store_episode(
        db: DatabaseHandler,
        stories_id: int,
        config: Optional[PodcastFetchEpisodeConfig] = None) -> None:
    """
    Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB.

    1) Determines the episode's likely language by looking into its title and description, and converts the language
       code to BCP 47;
    2) Using enclosures from "story_enclosures", chooses the one that looks most like a podcast episode;
    3) Fetches the chosen enclosure;
    4) Transcodes the file (if needed) by:
        a) converting it to an audio format that the Speech API can support, and / or
        b) discarding the video stream from the media file, and / or
        c) discarding other audio streams from the media file;
    5) Reads the various parameters, e.g. sample rate, of the episode audio file;
    6) Uploads the episode audio file to Google Cloud Storage;
    7) Adds a row to "podcast_episodes".

    Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller.

    :param db: Database handler.
    :param stories_id: Story ID for the story to operate on.
    :param config: (optional) Podcast fetcher configuration object (useful for testing).
    """

    if not config:
        config = PodcastFetchEpisodeConfig()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McStoryNotFoundException(f"Story {stories_id} was not found.")

    # Try to determine language of the story
    story_title = story['title']
    story_description = html_strip(story['description'])
    sample_text = f"{story_title}\n{story_description}"

    iso_639_1_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

    if not iso_639_1_language_code:
        iso_639_1_language_code = 'en'

    # Convert to BCP 47 identifier
    bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
        iso_639_1_code=iso_639_1_language_code,
        url_hint=story['url'],
    )

    # Find the enclosure that might work the best
    best_enclosure = podcast_viable_enclosure_for_story(db=db,
                                                        stories_id=stories_id)
    if not best_enclosure:
        raise McPodcastNoViableStoryEnclosuresException(
            f"There were no viable enclosures found for story {stories_id}")

    if best_enclosure.length:
        if best_enclosure.length > MAX_ENCLOSURE_SIZE:
            raise McPodcastEnclosureTooBigException(
                f"Chosen enclosure {best_enclosure} is too big.")

    try:
        temp_dir = tempfile.mkdtemp('fetch_and_store')
    except Exception as ex:
        raise McPodcastFileStoreFailureException(
            f"Unable to create temporary directory: {ex}")

    # Fetch enclosure
    input_filename = 'input_file'
    input_file_path = os.path.join(temp_dir, input_filename)
    log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...")
    fetch_big_file(url=best_enclosure.url,
                   dest_file=input_file_path,
                   max_size=MAX_ENCLOSURE_SIZE)
    log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}")

    if os.stat(input_file_path).st_size == 0:
        # Might happen with misconfigured webservers
        raise McPodcastFileFetchFailureException(
            f"Fetched file {input_file_path} is empty.")

    # Transcode if needed
    input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir,
                                             filename=input_filename)
    transcoded_file_obj = transcode_media_file_if_needed(
        input_media_file=input_file_obj)

    # Unset the variables so that we don't accidentally use them later
    del input_filename, temp_dir

    if input_file_obj != transcoded_file_obj:
        # Function did some transcoding and stored everything in yet another file

        # Remove the input file
        _cleanup_temp_dir(temp=input_file_obj)

        # Consider the transcoded file the new input file
        input_file_obj = transcoded_file_obj

    # (Re)read the properties of either the original or the transcoded file
    media_info = media_file_info(media_file_path=input_file_obj.temp_full_path)
    best_audio_stream = media_info.best_supported_audio_stream()

    # Store input file to GCS
    try:
        gcs = GCSStore(config=config)
        gcs_uri = gcs.store_object(
            local_file_path=input_file_obj.temp_full_path,
            object_id=str(stories_id),
            mime_type=best_audio_stream.audio_codec_class.mime_type(),
        )

    except Exception as ex:

        log.error(
            f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}"
        )

        # Clean up, then raise further
        _cleanup_temp_dir(temp=input_file_obj)

        raise ex

    # Clean up the locally stored file as we don't need it anymore
    _cleanup_temp_dir(temp=input_file_obj)

    # Insert everything to the database
    try:
        db.query(
            """
            INSERT INTO podcast_episodes (
                stories_id,
                story_enclosures_id,
                gcs_uri,
                duration,
                codec,
                sample_rate,
                bcp47_language_code
            ) VALUES (
                %(stories_id)s,
                %(story_enclosures_id)s,
                %(gcs_uri)s,
                %(duration)s,
                %(codec)s,
                %(sample_rate)s,
                %(bcp47_language_code)s            
            ) ON CONFLICT (stories_id) DO UPDATE SET
                story_enclosures_id = %(story_enclosures_id)s,
                gcs_uri = %(gcs_uri)s,
                duration = %(duration)s,
                codec = %(codec)s,
                sample_rate = %(sample_rate)s,
                bcp47_language_code = %(bcp47_language_code)s
        """, {
                'stories_id':
                stories_id,
                'story_enclosures_id':
                best_enclosure.story_enclosures_id,
                'gcs_uri':
                gcs_uri,
                'duration':
                best_audio_stream.duration,
                'codec':
                best_audio_stream.audio_codec_class.postgresql_enum_value(),
                'sample_rate':
                best_audio_stream.sample_rate,
                'bcp47_language_code':
                bcp_47_language_code,
            })

    except Exception as ex_db:

        # Try to delete object on GCS first
        try:
            gcs.delete_object(object_id=str(stories_id))
        except Exception as ex_gcs:
            # We should be able to delete it as we've just uploaded it
            raise McPodcastGCSStoreFailureException((
                f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; "
                f"database insert exception: {ex_db}; "
                f"GCS exception: {ex_gcs}"))

        raise McPodcastPostgreSQLException(
            f"Failed inserting episode for story {stories_id}: {ex_db}")
Example #19
0
    def fetch_transcript(
            cls, db: DatabaseHandler,
            podcast_episode_transcript_fetches_id: int
    ) -> Optional[Transcript]:
        transcript_fetch = db.find_by_id(
            table='podcast_episode_transcript_fetches',
            object_id=podcast_episode_transcript_fetches_id,
        )
        if not transcript_fetch:
            raise McDatabaseNotFoundException(
                f"Unable to find transcript fetch with ID {podcast_episode_transcript_fetches_id}"
            )
        podcast_episodes_id = transcript_fetch['podcast_episodes_id']

        episode = db.find_by_id(table='podcast_episodes',
                                object_id=podcast_episodes_id)
        if not episode:
            raise McDatabaseNotFoundException(
                f"Unable to find podcast episode with ID {podcast_episodes_id}"
            )

        stories_id = episode['stories_id']
        speech_operation_id = episode['speech_operation_id']

        if not speech_operation_id:
            raise McMisconfiguredSpeechAPIException(
                f"Speech ID for podcast episode {podcast_episodes_id} is unset."
            )

        try:
            config = PodcastFetchTranscriptConfig()
            client = SpeechClient.from_service_account_json(
                config.gc_auth_json_file())
            operations_client = OperationsClient(
                channel=client.transport.channel)
        except Exception as ex:
            raise McMisconfiguredSpeechAPIException(
                f"Unable to initialize Speech API operations client: {ex}")

        try:
            operation = operations_client.get_operation(
                name=speech_operation_id)
        except InvalidArgument as ex:
            raise McMisconfiguredSpeechAPIException(
                f"Invalid operation ID '{speech_operation_id}': {ex}")
        except NotFound as ex:
            raise McOperationNotFoundException(
                f"Operation ID '{speech_operation_id}' was not found: {ex}")
        except Exception as ex:
            # On any other errors, raise a hard exception
            raise McMisconfiguredSpeechAPIException(
                f"Error while fetching operation ID '{speech_operation_id}': {ex}"
            )

        if not operation:
            raise McMisconfiguredSpeechAPIException(f"Operation is unset.")

        try:
            gapic_operation: Operation = from_gapic(
                operation,
                operations_client,
                cloud_speech_pb2.LongRunningRecognizeResponse,
                metadata_type=cloud_speech_pb2.LongRunningRecognizeMetadata,
            )
        except Exception as ex:
            raise McMisconfiguredSpeechAPIException(
                f"Unable to create GAPIC operation: {ex}")

        log.debug(f"GAPIC operation: {gapic_operation}")
        log.debug(f"Operation metadata: {gapic_operation.metadata}")
        log.debug(f"Operation is done: {gapic_operation.done()}")
        log.debug(f"Operation error: {gapic_operation.done()}")

        try:
            operation_is_done = gapic_operation.done()
        except Exception as ex:
            # 'done' attribute might be gone in a newer version of the Speech API client
            raise McMisconfiguredSpeechAPIException(
                f"Unable to test whether operation '{speech_operation_id}' is done: {ex}"
            )

        if not operation_is_done:
            log.info(f"Operation '{speech_operation_id}' is still not done.")
            return None

        utterances = []

        try:
            for result in gapic_operation.result().results:

                alternatives = []
                for alternative in result.alternatives:
                    alternatives.append(
                        UtteranceAlternative(
                            text=alternative.transcript.strip(),
                            confidence=alternative.confidence,
                        ))

                utterances.append(
                    Utterance(
                        alternatives=alternatives,
                        bcp47_language_code=result.language_code,
                    ))

        except GoogleAPICallError as ex:
            raise McTranscriptionReturnedErrorException(
                f"Unable to read transcript for operation '{speech_operation_id}': {ex}"
            )

        except Exception as ex:
            raise McMisconfiguredSpeechAPIException(
                f"Unable to read transcript for operation '{speech_operation_id}': {ex}"
            )

        return Transcript(stories_id=stories_id, utterances=utterances)
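
A hedged polling sketch around fetch_transcript() above: a None result means the long-running Speech API operation is still in progress, and a Transcript is returned once it is done. The TranscriptFetcher class name and wrapper function are assumptions; store_transcript() is the classmethod from Example #4.

# Hypothetical poller (not from the original code): retry later while the operation
# is still running, otherwise persist the transcript.
def try_store_transcript(db: DatabaseHandler, fetches_id: int) -> Optional[int]:
    transcript = TranscriptFetcher.fetch_transcript(
        db=db,
        podcast_episode_transcript_fetches_id=fetches_id,
    )

    if transcript is None:
        log.info("Transcript is not ready yet; will retry later.")
        return None

    # Store the transcript text as a raw download for later extraction (see Example #4).
    return TranscriptFetcher.store_transcript(db=db, transcript=transcript)
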
Example #20
0
def add_story(db: DatabaseHandler,
              story: dict,
              feeds_id: int,
              skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(
            skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    db.find_or_create(table='feeds_stories_map',
                      insert_hash={
                          'stories_id': story['stories_id'],
                          'feeds_id': feeds_id,
                      })

    db.commit()

    return story
Example #21
0
def add_story(db: DatabaseHandler, story: dict,
              feeds_id: int) -> Optional[dict]:
    """Return an existing dup story if it matches the url, guid, or title; otherwise, add a new story and return it.

    Returns found or created story. Adds an is_new = True story if the story was created by the call.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    db_story = find_dup_story(db, story)
    if db_story:
        log.debug("found existing dup story: %s [%s]" %
                  (story['title'], story['url']))
        db.commit()
        return db_story

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    story['is_new'] = True

    [insert_story_urls(db, story, u) for u in (story['url'], story['guid'])]

    # on conflict does not work with partitioned feeds_stories_map
    db.query(
        """
        insert into feeds_stories_map_p ( feeds_id, stories_id )
            select %(a)s, %(b)s where not exists (
                select 1 from feeds_stories_map where feeds_id = %(a)s and stories_id = %(b)s )
        """, {
            'a': feeds_id,
            'b': story['stories_id']
        })

    db.commit()

    log.debug("added story: %s" % story['url'])

    return story
Example #22
0
def add_story(db: DatabaseHandler, story: dict,
              feeds_id: int) -> Optional[dict]:
    """Return an existing dup story if it matches the url, guid, or title; otherwise, add a new story and return it.

    Returns found or created story. Adds an is_new = True story if the story was created by the call.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)

    # PostgreSQL is not a fan of NULL bytes in strings
    for key in story.keys():
        if isinstance(story[key], str):
            story[key] = story[key].replace('\x00', '')

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False

        # Description can be None
        if not story.get('description', None):
            story['full_text_rss'] = False

    if len(story['url']) >= MAX_URL_LENGTH:
        log.error(f"Story's URL is too long: {story['url']}")
        return None

    db_stories = _find_dup_stories(db, story)
    if db_stories:
        first_story = db_stories[0]
        log.debug(
            f"Found one or more duplicate stories: {first_story['title']} [{first_story['url']}]"
        )
        return first_story

    # After sharding stories.guid no longer can have a UNIQUE index so we can no longer do an atomic upsert, and
    # pre-atomic PostgreSQL upserts (INSERT INTO ... SELECT ... WHERE NOT EXISTS) have race conditions. So instead here
    # we insert a new row, check for "duplicate stories" again, find out how many we have, and if we have more than one
    # (i.e. something managed to get inserted while we were doing our own insert), we get rid of the row that we've just
    # added
    try:
        inserted_story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        raise McAddStoryException(
            f"Error while adding story: {ex}\nStory: {story}")

    db_stories = db.query(
        """
        SELECT *
        FROM stories
        WHERE
            (guid = ANY(%(urls)s) OR url = ANY(%(urls)s)) AND
            media_id = %(media_id)s
        ORDER BY stories_id
    """, {
            'urls': _get_story_url_variants(story),
            'media_id': story['media_id'],
        }).hashes()

    if len(db_stories) == 0:
        raise McAddStoryException(
            f"Story got added but we can't find it now; story: {story}")

    elif len(db_stories) == 1:
        story = inserted_story
        story['is_new'] = True

    elif len(db_stories) > 1:
        db.query(
            """
            DELETE FROM stories
            WHERE stories_id = %(stories_id)s
        """, {
                'stories_id': inserted_story['stories_id'],
            })
        story = db_stories[0]

    [insert_story_urls(db, story, u) for u in (story['url'], story['guid'])]

    db.query(
        """
        INSERT INTO feeds_stories_map (feeds_id, stories_id)
        VALUES (%(feeds_id)s, %(stories_id)s)
        ON CONFLICT (feeds_id, stories_id) DO NOTHING
        """, {
            'feeds_id': feeds_id,
            'stories_id': story['stories_id'],
        })

    log.debug(f"Added story: {story['url']}")

    return story
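
A short usage sketch for this sharded add_story() variant; per its docstring, is_new is True only when the call actually created the story, which a caller can use to decide whether to queue follow-up work. The wrapper name and queue_story_extraction() are hypothetical placeholders.

# Hypothetical caller (not from the original code): only queue follow-up work for
# stories that this call actually created.
def add_story_and_queue_if_new(db: DatabaseHandler, new_story: dict, feeds_id: int) -> None:
    story = add_story(db=db, story=new_story, feeds_id=feeds_id)

    if story and story.get('is_new'):
        # Placeholder for whatever follow-up the deployment runs on new stories.
        queue_story_extraction(stories_id=story['stories_id'])
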