Example #1
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
        from stories s, topic_stories ts, media m
        where s.stories_id = ts.stories_id
          and s.media_id = m.media_id
          and m.foreign_rss_links = true
          and ts.topics_id = %(a)s
          and not ts.valid_foreign_rss_story
        """, {
            'a': topic['topics_id']
        }).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {
                'a': story['stories_id']
            }).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(
                f"Unable to fetch content for download {download['downloads_id']}: {ex}"
            )

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls', {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            """
            update topic_links set ref_stories_id = null, link_spidered = 'f'
                where topics_id = %(b)s and ref_stories_id = %(a)s
            """, {
                'a': story['stories_id'],
                'b': topic['topics_id']
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {
                'a': story['stories_id'],
                'b': topic['topics_id']
            })
        db.commit()
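# A minimal usage sketch (not from the original source): run the merge for one topic on
# its own connection, mirroring how the surrounding test examples obtain a handle with
# connect_to_db(). The topics_id value (42) is a placeholder.
def _example_merge_foreign_rss_stories() -> None:
    db = connect_to_db()

    # Look up the topic row; require_by_id() raises if the placeholder id does not exist.
    topic = db.require_by_id('topics', 42)

    # Move stories whose medium has foreign_rss_links back to topic_seed_urls.
    merge_foreign_rss_stories(db, topic)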
Example #2
def extract(db: DatabaseHandler, download: dict, extractor_args: PyExtractorArguments = PyExtractorArguments()) -> dict:
    """Extract the content for the given download.

    Arguments:
    db - db handle
    download - download dict from db
    extractor_args - extractor arguments; use_cache() determines whether to get and set results in the extractor cache

    Returns:
    see extract_content() below

    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    if extractor_args.use_cache():
        log.debug("Fetching cached extractor results for download {}...".format(downloads_id))
        results = _get_extractor_results_cache(db, download)
        if results is not None:
            return results

    log.debug("Fetching content for download {}...".format(downloads_id))
    content = fetch_content(db, download)

    log.debug("Extracting {} characters of content for download {}...".format(len(content), downloads_id))
    results = extract_content(content)
    log.debug(
        "Done extracting {} characters of content for download {}.".format(len(content), downloads_id))

    if extractor_args.use_cache():
        log.debug("Caching extractor results for download {}...".format(downloads_id))
        _set_extractor_results_cache(db, download, results)

    return results
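# A minimal usage sketch (not from the original source): run the extractor on the first
# download of a story and read back the extracted HTML. The 'extracted_html' key mirrors
# what the surrounding examples use; treat it as an assumption here.
def _example_extract_first_download(db: DatabaseHandler, stories_id: int) -> str:
    download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': stories_id}).hash()

    if not download:
        return ''

    # Default PyExtractorArguments(); whether the cache is consulted depends on its defaults.
    results = extract(db=db, download=download)

    return results.get('extracted_html', '')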
def _get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Get the extracted html for the story.

    We don't store the extracted html of a story, so we have to get the first download associated with the story
    and run the extractor on it.

    """
    download = db.query(
        """
        with d as (
            select * from downloads
                where
                    stories_id = %(a)s and
                    type = 'content' and
                    state = 'success'
        ) -- goofy cte to avoid bad query plan

        select * from d order by downloads_id limit 1
        """, {
            'a': story['stories_id']
        }).hash()

    if not download:
        return ''

    html = fetch_content(db, download)

    extract = extract_article_html_from_page_html(html)
    extracted_html = extract['extracted_html']

    return extracted_html
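# A minimal usage sketch (not from the original source): log the length of the extracted
# HTML for a single story; the stories_id lookup uses require_by_id() as elsewhere in
# these examples.
def _example_log_extracted_html_length(db: DatabaseHandler, stories_id: int) -> None:
    story = db.require_by_id('stories', stories_id)

    extracted_html = _get_extracted_html(db, story)
    log.info(f"Extracted HTML length for story {stories_id}: {len(extracted_html)}")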
Example #4
def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    story_content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    story_url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=story_url, content=story_content)

    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == story_url
    assert story['guid'] == story_url

    medium = db.require_by_id('media', story['media_id'])

    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    feed = db.query(
        "select f.* from feeds f join feeds_stories_map fsm using ( feeds_id ) where stories_id = %(a)s",
        {
            'a': story['stories_id']
        }).hash()

    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    (date_tag, date_tag_set) = get_story_date_tag(db, story)

    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    download = db.query("select * from downloads where stories_id = %(a)s", {
        'a': story['stories_id']
    }).hash()

    assert download is not None
    assert download['url'] == story['url']

    content = fetch_content(db, download)

    assert content == story_content

    story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
        fallback_date='2011-11-11',
    )

    assert story['publish_date'] == '2011-11-11 00:00:00'

    matched_story = generate_story(db, story['url'], 'foo')
    assert matched_story['stories_id'] == story['stories_id']

    story = generate_story(db=db, url='invalid url', content='foo')

    assert story is not None
def test_copy_story_to_new_medium():
    """Test copy_story_to_new_medium."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)

    add_content_to_test_story(db, old_story, old_feed)

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    for field in 'title url guid publish_date'.split():
        assert old_story[field] == new_story[field]

    topic_story_exists = db.query("""
        SELECT *
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
    }).hash()
    assert topic_story_exists is not None

    new_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    story_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hashes()
    assert len(story_sentences) > 0
Example #6
def store_and_verify_content(db: DatabaseHandler, download: dict, content: str) -> None:
    """Call store content and then poll verifying that the content has been stored.

    Only return once we have verified that the content has been stored.  Raise an error if the content is still not
    found after a timeout.  S3 content does not seem to be available for fetching until shortly after it is written,
    so this function makes sure the content is there once the store operation is done.
    """
    store_content(db, download, content)

    tries = 0
    while True:
        try:
            fetch_content(db, download)
            break
        except Exception as e:
            if tries > STORE_CONTENT_TIMEOUT:
                raise e

            log.debug(f"story_and_verify_content: waiting to retry verification ({tries}) ...")
            tries += 1
            time.sleep(1)
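# A minimal usage sketch (not from the original source): store content for an existing
# download row and block until it can be fetched back. The downloads_id is a placeholder.
def _example_store_and_verify(db: DatabaseHandler, downloads_id: int, content: str) -> None:
    download = db.require_by_id('downloads', downloads_id)

    # Returns only once fetch_content() succeeds, or re-raises after STORE_CONTENT_TIMEOUT tries.
    store_and_verify_content(db, download, content)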
def _get_youtube_embed_links(db: DatabaseHandler, story: dict) -> List[str]:
    """Parse youtube embedded video urls out of the full html of the story.

    This function looks for youtube embed links anywhere in the html of the story content, rather than just in the
    extracted html.  It aims to return a superset of all youtube embed links by returning every iframe src= attribute
    that includes the string 'youtube'.

    Arguments:
    db - db handle
    story - story dict from db

    Returns:
    list of string urls

    """
    download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY downloads_id
        LIMIT 1
    """, {
            'stories_id': story['stories_id'],
        }).hash()

    html = fetch_content(db, download)

    soup = BeautifulSoup(html, 'lxml')

    links = []
    for tag in soup.find_all('iframe', src=True):
        url = tag['src']

        if 'youtube' not in url:
            continue

        if not url.lower().startswith('http'):
            url = 'http:' + url

        url = url.strip()

        url = url.replace('youtube-embed', 'youtube')

        links.append(url)

    return links
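# A minimal usage sketch (not from the original source): collect the YouTube embed links
# for a single story and log them, assuming the same imports as the surrounding examples.
def _example_log_youtube_embed_links(db: DatabaseHandler, stories_id: int) -> None:
    story = db.require_by_id('stories', stories_id)

    for url in _get_youtube_embed_links(db, story):
        log.info(f"Found YouTube embed link: {url}")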
    def test_fetch_content(self) -> None:
        """Test fetch_content by manually storing using the PostgreSQL store and then trying to fetch it."""
        db = self._db
        with self.assertRaises(McDBIDownloadsException):
            fetch_content(db=db, download={})

        with self.assertRaises(McDBIDownloadsException):
            fetch_content(db=db,
                          download={
                              'downloads_id': 1,
                              'state': 'error'
                          })

        amazon_s3_downloads_config = _default_amazon_s3_downloads_config()

        class DoNotReadAllFromS3DownloadStorageConfig(DownloadStorageConfig):
            @staticmethod
            def read_all_from_s3():
                return False

            @staticmethod
            def fallback_postgresql_to_s3():
                return False

        store = _get_store_for_reading(
            download=self.test_download,
            amazon_s3_downloads_config=amazon_s3_downloads_config,
            download_storage_config=DoNotReadAllFromS3DownloadStorageConfig(),
        )

        content = 'foo bar'
        store.store_content(db=db,
                            object_id=self.test_download['downloads_id'],
                            content=content)
        got_content = fetch_content(
            db=db,
            download=self.test_download,
            download_storage_config=DoNotReadAllFromS3DownloadStorageConfig(),
        )
        assert got_content == content

        content = b'foo bar'
        store.store_content(db=db,
                            object_id=self.test_download['downloads_id'],
                            content=content)
        got_content = fetch_content(
            db=db,
            download=self.test_download,
            download_storage_config=DoNotReadAllFromS3DownloadStorageConfig(),
        )
        assert got_content == content.decode()
def _get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Get the extracted html for the story.

    We don't store the extracted html of a story, so we have to get the first download associated with the story
    and run the extractor on it.

    """
    download = db.query(
        """
        WITH d AS (
            SELECT *
            FROM downloads
            WHERE
                stories_id = %(stories_id)s AND
                type = 'content' AND
                state = 'success'
        ) -- goofy cte to avoid bad query plan

        SELECT *
        FROM d
        ORDER BY downloads_id
        LIMIT 1
    """, {
            'stories_id': story['stories_id'],
        }).hash()

    if not download:
        return ''

    html = fetch_content(db, download)

    # avoid extracting large binary files
    if '<' not in html[0:1000]:
        if 'http' in html:
            return html[0:1000000]
        else:
            return ''

    extract = extract_content(html)
    extracted_html = extract['extracted_html']

    return extracted_html
    def test_full_chain(self):
        transcript = None

        handler = DefaultHandler()

        for x in range(1, 60 + 1):
            log.info(f"Waiting for transcript to be finished (#{x})...")

            podcast_episode_transcript_fetches_id = \
                self.transcript_fetches[0]['podcast_episode_transcript_fetches_id']
            transcript = handler.fetch_transcript(
                db=self.db,
                podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
            )
            if transcript:
                log.info("Transcript is here!")
                break

            time.sleep(2)

        assert transcript
        assert transcript.stories_id
        assert len(transcript.utterances) == 1
        assert len(transcript.utterances[0].alternatives) == 1
        assert 'kim kardashian' in transcript.utterances[0].alternatives[0].text.lower()

        downloads_id = handler.store_transcript(db=self.db,
                                                transcript=transcript)

        download = self.db.find_by_id(table='downloads',
                                      object_id=downloads_id)

        raw_download = fetch_content(db=self.db, download=download)
        assert raw_download
        assert 'kim kardashian' in raw_download.lower()
Example #11
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict,
                             new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db,
                         story=story,
                         topic=topic,
                         valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([
                (f, old_download[f])
                for f in ['state', 'error_message', 'download_time']
            ])
            db.update_by_id('downloads', download['downloads_id'],
                            download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """, {'a': download['downloads_id']})

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """, {'b': old_story['stories_id']})

    return story
Example #12
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        """
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
            ORDER BY downloads_id
            LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        db.query(
            """
                INSERT INTO download_texts (
                    downloads_id,
                    download_text,
                    download_text_length
                )
                    SELECT
                        %(downloads_id)s,
                        dt.download_text,
                        dt.download_text_length
                    FROM download_texts AS dt
                    WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # noinspection SqlInsertValues
    db.query(
        """
            INSERT INTO story_sentences (
                stories_id,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            )
                SELECT
                    %(new_stories_id)s,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date,
                    language
                FROM story_sentences
                WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story
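# A minimal usage sketch (not from the original source): copy an existing topic story into
# a different medium and re-fetch the copied raw content, mirroring what
# test_copy_story_to_new_medium above asserts. All ids are placeholders.
def _example_copy_story_to_new_medium(db: DatabaseHandler, topics_id: int, stories_id: int, media_id: int) -> None:
    topic = db.require_by_id('topics', topics_id)
    old_story = db.require_by_id('stories', stories_id)
    new_medium = db.require_by_id('media', media_id)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    new_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': new_story['stories_id']}).hash()

    # The copied content should be fetchable if the original story had a stored download.
    if new_download:
        log.info(f"Copied story {new_story['stories_id']}; content length: {len(fetch_content(db, new_download))}")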
Example #13
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query("""
        WITH topic_stories_from_topic AS (
            SELECT stories_id
            FROM topic_stories
            WHERE
                topics_id = %(topics_id)s AND
                (NOT valid_foreign_rss_story)
        )

        SELECT stories.*
        FROM stories
            INNER JOIN media ON
                stories.media_id = media.media_id AND
                media.foreign_rss_links
        WHERE stories.stories_id IN (
            SELECT stories_id
            FROM topic_stories_from_topic
        )
    """, {
        'topics_id': topic['topics_id'],
    }).hashes()

    for story in stories:
        download = db.query(
            """
                SELECT *
                FROM downloads
                WHERE stories_id = %(stories_id)s
                ORDER BY downloads_id
                LIMIT 1
            """,
            {
                'stories_id': story['stories_id'],
            }
        ).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(f"Unable to fetch content for download {download['downloads_id']}: {ex}")

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'topics_id': topic['topics_id'],
                'url': story['url'],
                'source': 'merge_foreign_rss_stories',
                'content': content,
            },
        )

        db.query(
            """
                UPDATE topic_links SET
                    ref_stories_id = NULL,
                    link_spidered = 'f'
                WHERE
                    topics_id = %(topics_id)s AND
                    ref_stories_id = %(ref_stories_id)s
            """,
            {
                'ref_stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )

        db.query(
            """
            DELETE FROM topic_stories
            WHERE
                stories_id = %(stories_id)s AND
                topics_id = %(topics_id)s
            """,
            {
                'stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )

        db.commit()
Example #14
async def test_workflow():
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db,
                                   label='keeping up with Kardashians',
                                   feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Not localhost as this might get fetched from a remote worker
    mp3_url = hs.page_url('/test.mp3')

    db.insert(table='story_enclosures',
              insert_hash={
                  'stories_id': stories_id,
                  'url': mp3_url,
                  'mime_type': 'audio/mpeg',
                  'length': len(test_mp3_data),
              })

    client = workflow_client()

    # Start worker
    factory = WorkerFactory(client=client, namespace=client.namespace)
    worker = factory.new_worker(task_queue=TASK_QUEUE)

    # Use an activities implementation with random GCS prefixes set
    activities = _RandomPrefixesPodcastTranscribeActivities()

    worker.register_activities_implementation(
        activities_instance=activities,
        activities_cls_name=PodcastTranscribeActivities.__name__,
    )
    worker.register_workflow_implementation_type(
        impl_cls=PodcastTranscribeWorkflowImpl)
    factory.start()

    # Initialize workflow instance
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(
            workflow_id=str(stories_id),

            # By default, if individual activities of the workflow fail, they will get restarted pretty much
            # indefinitely, and so this test might run for days (or rather just timeout on the CI). So we cap the
            # workflow so that if it doesn't manage to complete in X minutes, we consider it as failed.
            workflow_run_timeout=timedelta(minutes=5),
        ),
    )

    # Wait for the workflow to complete
    await workflow.transcribe_episode(stories_id)

    downloads = db.select(table='downloads', what_to_select='*').hashes()
    assert len(downloads) == 1
    first_download = downloads[0]
    assert first_download['stories_id'] == stories_id
    assert first_download['type'] == 'content'
    assert first_download['state'] == 'success'

    download_content = fetch_content(db=db, download=first_download)

    # It's what gets said in the sample MP3 file
    assert 'Kim Kardashian' in download_content

    # Initiate the worker shutdown in the background while we do the GCS cleanup so that stop_worker_faster()
    # doesn't have to wait that long
    await worker.stop(background=True)

    log.info("Cleaning up GCS...")
    GCSStore(bucket_config=activities.config.raw_enclosures()).delete_object(
        object_id=str(stories_id))
    GCSStore(
        bucket_config=activities.config.transcoded_episodes()).delete_object(
            object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcripts()).delete_object(
        object_id=str(stories_id))
    log.info("Cleaned up GCS")

    log.info("Stopping workers...")
    await stop_worker_faster(worker)
    log.info("Stopped workers")
Example #15
def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    story_content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    story_url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=story_url, content=story_content)

    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == story_url
    assert story['guid'] == story_url

    medium = db.require_by_id('media', story['media_id'])

    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    feed = db.query(
        """
        SELECT f.*
        FROM feeds_stories_map AS fsm
            INNER JOIN feeds AS f ON
                fsm.feeds_id = f.feeds_id
        WHERE fsm.stories_id = %(stories_id)s
    """, {
            'stories_id': story['stories_id'],
        }).hash()

    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    (date_tag, date_tag_set) = get_story_date_tag(db, story)

    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
            'stories_id': story['stories_id'],
        }).hash()

    assert download is not None
    assert download['url'] == story['url']

    content = fetch_content(db, download)

    assert content == story_content

    story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
    )

    assert story['publish_date'] is None

    matched_story = generate_story(db, story['url'], 'foo')
    assert matched_story['stories_id'] == story['stories_id']

    story = generate_story(db=db, url='invalid url', content='foo')

    assert story is not None