Example #1
    def store_transcript(cls, db: DatabaseHandler,
                         transcript: Transcript) -> int:
        story = db.find_by_id(table='stories', object_id=transcript.stories_id)

        feed = db.query(
            """
            SELECT *
            FROM feeds
            WHERE feeds_id = (
                SELECT feeds_id
                FROM feeds_stories_map
                WHERE stories_id = %(stories_id)s
            )
        """, {
                'stories_id': transcript.stories_id,
            }).hash()

        download = create_download_for_new_story(db=db, story=story, feed=feed)

        text = cls._download_text_from_transcript(transcript=transcript)

        # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later
        store_content(db=db, download=download, content=text)

        return download['downloads_id']
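
A minimal usage sketch for the classmethod above (the enclosing class is not shown in this example, so it appears here as a hypothetical TranscriptStore); the Transcript is assumed to have been built elsewhere, for example with Transcript.from_dict() as in Example #3:

    # Hypothetical call site; TranscriptStore stands in for the (unnamed) class that
    # defines store_transcript(), and the surrounding names are assumptions.
    def save_transcript(db, transcript) -> int:
        # Store the transcript text as a raw download for its story.
        downloads_id = TranscriptStore.store_transcript(db=db, transcript=transcript)

        # The raw download is now in place for the "extract-and-vector" app to process later.
        return downloads_id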
Example #2
    def test_skip_self_links(self):
        """Test that self links are skipped within extract_links_for_topic_story"""

        story_domain = get_url_distinctive_domain(self.test_story['url'])

        topic = create_test_topic(self.db, 'links')
        self.db.create(
            'topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': self.test_story['stories_id']
            })

        num_links = MAX_SELF_LINKS * 2
        content = ''
        for i in range(num_links):
            plain_text = "Sample sentence to make sure the links get extracted" * 10
            url = "http://%s/%d" % (story_domain, i)
            paragraph = "<p>%s <a href='%s'>link</a></p>\n\n" % (plain_text,
                                                                 url)
            content = content + paragraph

        store_content(self.db, self.test_download, content)

        extract_links_for_topic_story(db=self.db,
                                      stories_id=self.test_story['stories_id'],
                                      topics_id=topic['topics_id'])

        topic_links = self.db.query(
            "select * from topic_links where topics_id = %(a)s", {
                'a': topic['topics_id']
            }).hashes()

        assert len(topic_links) == MAX_SELF_LINKS
Example #3
    async def fetch_store_transcript(self, stories_id: int) -> None:

        log.info(f"Fetching and storing transcript for story {stories_id}...")

        with tempfile.TemporaryDirectory(
                prefix='fetch_store_transcript') as temp_dir:
            transcript_json_path = os.path.join(temp_dir, 'transcript.json')

            gcs = GCSStore(bucket_config=self.config.transcripts())
            gcs.download_object(object_id=str(stories_id),
                                local_file_path=transcript_json_path)

            with open(transcript_json_path, 'r') as f:
                transcript_json = f.read()

        transcript = Transcript.from_dict(decode_json(transcript_json))

        db = connect_to_db_or_raise()

        story = db.find_by_id(table='stories', object_id=stories_id)

        feed = db.query(
            """
            SELECT *
            FROM feeds
            WHERE feeds_id = (
                SELECT feeds_id
                FROM feeds_stories_map
                WHERE stories_id = %(stories_id)s
            )
        """, {
                'stories_id': stories_id,
            }).hash()

        # Just like create_download_for_new_story(), this creates a new download, except that it first checks
        # whether such a download already exists
        download = db.find_or_create(
            table='downloads',
            insert_hash={
                'feeds_id': feed['feeds_id'],
                'stories_id': story['stories_id'],
                'url': story['url'],
                'host': get_url_host(story['url']),
                'type': 'content',
                'sequence': 1,
                'state': 'success',
                'path': 'content:pending',
                'priority': 1,
                'extracted': 'f'
            },
        )

        text = transcript.download_text_from_transcript()

        # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later
        store_content(db=db, download=download, content=text)

        log.info(
            f"Done fetching and storing transcript for story {stories_id}")
Example #4
    def test_get_extracted_html(self) -> None:
        content = '<html><head><meta foo="bar" /></head><body>foo</body></html>'

        store_content(self.db, self.test_download, content)

        extracted_html = _get_extracted_html(self.db, self.test_story)

        assert extracted_html.strip() == '<body id="readabilityBody">foo</body>'
Example #5
    def setUp(self) -> None:
        """Set config for tests."""
        super().setUp()

        self.db = connect_to_db()

        self.test_medium = create_test_medium(self.db, 'downloads test')
        self.test_feed = create_test_feed(self.db, 'downloads test', self.test_medium)
        self.test_download_feed = create_download_for_feed(self.db, self.test_feed)
        self.test_story = create_test_story(self.db, label='downloads test', feed=self.test_feed)
        self.test_download = create_download_for_story(self.db, feed=self.test_feed, story=self.test_story)

        store_content(db=self.db, download=self.test_download, content=self.__TEST_CONTENT)
Example #6
    def test_store_content(self) -> None:
        """Test store_content by calling store_content and then calling fetch_content() on the postgresql store."""

        amazon_s3_downloads_config = _default_amazon_s3_downloads_config()

        class DoNotReadAllFromS3DownloadStorageConfig(DownloadStorageConfig):
            @staticmethod
            def read_all_from_s3():
                return False

            @staticmethod
            def fallback_postgresql_to_s3():
                return False

            @staticmethod
            def storage_locations():
                return ['postgresql']

        store = _get_store_for_reading(
            download=self.test_download,
            amazon_s3_downloads_config=amazon_s3_downloads_config,
            download_storage_config=DoNotReadAllFromS3DownloadStorageConfig(),
        )

        content = 'bat baz bar foo'
        got_download = store_content(db=self._db,
                                     download=self.test_download,
                                     content=content)
        got_content = store.fetch_content(
            db=self._db,
            object_id=self.test_download['downloads_id']).decode()

        assert got_content == content
        assert got_download['state'] == 'success'
        assert got_download['path'] == 'postgresql:raw_downloads'
        assert got_download['error_message'] == ''

        content = 'bat baz bar'
        self.test_download['state'] = 'feed_error'
        got_download = store_content(db=self._db,
                                     download=self.test_download,
                                     content=content)
        got_content = store.fetch_content(
            db=self._db,
            object_id=self.test_download['downloads_id']).decode()

        assert got_content == content
        assert got_download['state'] == 'feed_error'
        assert got_download['path'] == 'postgresql:raw_downloads'
        assert not got_download['error_message']  # NULL or an empty string
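
The same override pattern can be used to exercise other storage behaviours; a sketch, assuming only the three hooks already used in the test above (read_all_from_s3(), fallback_postgresql_to_s3() and storage_locations()):

    class PostgresqlWithS3FallbackStorageConfig(DownloadStorageConfig):
        """Assumed variant: read from PostgreSQL first, but allow falling back to S3."""

        @staticmethod
        def read_all_from_s3():
            return False

        @staticmethod
        def fallback_postgresql_to_s3():
            return True

        @staticmethod
        def storage_locations():
            return ['postgresql']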
Example #7
    def test_get_youtube_embed_links(self) -> None:

        youtube_html = """
        <iframe src="http://youtube.com/embed/1234" />
        <img src="http://foo.com/foo.png" />
        <iframe src="http://youtube-embed.com/embed/3456" />
        <iframe src="http://bar.com" />
        """

        store_content(self.db, self.test_download, youtube_html)

        links = _get_youtube_embed_links(self.db, self.test_story)

        assert links == [
            'http://youtube.com/embed/1234', 'http://youtube.com/embed/3456'
        ]
Example #8
    def setUp(self):
        """Create test_story and test_download."""
        super().setUp()
        self.db = connect_to_db()

        media = create_test_story_stack(self.db, {'A': {'B': [1]}})

        story = media['A']['feeds']['B']['stories']['1']

        download = create_download_for_story(
            db=self.db,
            feed=media['A']['feeds']['B'],
            story=story,
        )

        store_content(self.db, download, '<p>foo</p>')

        self.test_story = story
        self.test_download = download
Example #9
def store_and_verify_content(db: DatabaseHandler, download: dict, content: str) -> None:
    """Call store content and then poll verifying that the content has been stored.

    Only return once we have verified that the content has been stored.  Raise an error after a
    timeout if the content is not found.  It seems like S3 content is not available for fetching until a small
    delay after writing it.  This function makes sure the content is there once the store operation is done.
    """
    store_content(db, download, content)

    tries = 0
    while True:
        try:
            fetch_content(db, download)
            break
        except Exception as e:
            if tries > STORE_CONTENT_TIMEOUT:
                raise e

            log.debug(f"story_and_verify_content: waiting to retry verification ({tries}) ...")
            tries += 1
            time.sleep(1)
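
A sketch of a call site that needs read-after-write consistency (for example when the content will be fetched again right away from an S3-backed store); db, download and content are assumed to come from the usual handlers:

    def save_and_reread(db, download: dict, content: str) -> None:
        # Retries fetch_content() for up to STORE_CONTENT_TIMEOUT attempts before giving up.
        store_and_verify_content(db, download, content)
        # Once the call above returns, the stored content is known to be readable.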
Example #10
    def test_extract(self) -> None:
        """Test extract()."""

        html = '<script>ignore</script><p>foo</p>'
        store_content(self.db, self.test_download, html)
        result = extract(db=self.db, download=self.test_download)

        assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
        assert result['extracted_text'].strip() == 'foo.'

        store_content(self.db, self.test_download, html)
        extract(
            db=self.db,
            download=self.test_download,
            extractor_args=PyExtractorArguments(use_cache=True),
        )
        store_content(self.db, self.test_download, 'bar')
        result = extract(
            db=self.db,
            download=self.test_download,
            extractor_args=PyExtractorArguments(use_cache=True),
        )
        assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
        assert result['extracted_text'].strip() == 'foo.'
Example #11
    def test_get_links_from_story(self):
        """Test get_links_from_story()."""

        self.test_story['title'] = 'http://title.text'
        self.test_story['description'] = '<a href="http://description.link" />http://description.text'
        self.db.update_by_id('stories', self.test_story['stories_id'], self.test_story)

        html_content = """
        <p>Here is a content <a href="http://content.1.link">link</a>.</p>
        <p>Here is another content <a href="http://content.2.link" />link</a>.</p>
        <p>Here is a duplicate content <a href="http://content.2.link" />link</a>.</p>
        <p>Here is a duplicate text <a href="http://link-text.dup" />link</a>.</p>
        <p>Here is a youtube embed:</p>
        <iframe src="http://youtube-embed.com/embed/123456" />
        """

        download_text = dict()
        download_text['downloads_id'] = self.test_download['downloads_id']
        download_text['download_text'] = "http://text.1.link http://text.2.link http://text.2.link http://link-text.dup"
        download_text['download_text_length'] = len(download_text['download_text'])
        self.db.create('download_texts', download_text)

        expected_links = """
        http://content.1.link
        http://content.2.link
        http://youtube.com/embed/123456
        http://title.text
        http://description.link
        http://description.text
        http://text.1.link
        http://text.2.link
        http://link-text.dup
        """.split()

        store_content(self.db, self.test_download, html_content)

        links = _get_links_from_story(self.db, self.test_story)

        assert sorted(links) == sorted(expected_links)
Example #12
    def store_download(self, db: DatabaseHandler, download: dict,
                       content: str) -> List[int]:
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        downloads_id = download['downloads_id']
        stories_id = download['stories_id']

        if not downloads_id:
            raise McCrawlerFetcherHardError("'downloads_id' is empty.")

        if not stories_id:
            raise McCrawlerFetcherHardError("'stories_id' is empty.")

        if content is None:
            # Content might be empty but not None
            raise McCrawlerFetcherHardError(
                f"Content for download {downloads_id}, story {stories_id} is None."
            )

        log.info(
            f"Processing content download {downloads_id} (story {stories_id})..."
        )

        if len(content) == 0:
            log.warning(
                f"Content for download {downloads_id}, story {stories_id} is empty."
            )

        download = store_content(db=db, download=download, content=content)

        log.info(
            f"Done processing content download {downloads_id} (story {stories_id})"
        )

        story_ids_to_extract = [
            download['stories_id'],
        ]

        return story_ids_to_extract
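
A sketch of how a caller might consume the returned list; the queueing helper is hypothetical and only illustrates that the handler stores the content and reports which stories still need extraction:

    def handle_content_download(handler, db, download: dict, content: str) -> None:
        # Store the raw content; the handler returns the stories to extract next.
        story_ids = handler.store_download(db=db, download=download, content=content)

        for stories_id in story_ids:
            queue_story_for_extraction(stories_id)  # hypothetical queueing helper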
Example #13
def test_merge_foreign_rss_stories():
    """Test merge_foreign_rss_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')

    medium = create_test_medium(db, 'norss')
    feed = create_test_feed(db=db, label='norss', medium=medium)
    num_stories = 10
    stories = [
        create_test_story(db=db, label=str(i), feed=feed)
        for i in range(num_stories)
    ]

    rss_medium = create_test_medium(db, 'rss')
    rss_medium = db.query(
        """
        UPDATE media SET
            foreign_rss_links = 't'
        WHERE media_id = %(media_id)s
        RETURNING *
    """, {
            'media_id': rss_medium['media_id'],
        }).hash()
    rss_feed = create_test_feed(db=db, label='rss', medium=rss_medium)
    num_rss_stories = 10
    rss_stories = []
    for i in range(num_rss_stories):
        story = create_test_story(db=db, label=str(i), feed=rss_feed)
        download = db.create(
            'downloads', {
                'stories_id': story['stories_id'],
                'feeds_id': rss_feed['feeds_id'],
                'url': story['url'],
                'host': 'foo',
                'type': 'content',
                'state': 'success',
                'priority': 0,
                'sequence': 0,
                'path': 'postgresql'
            })
        store_content(db, download, story['title'])
        rss_stories.append(story)

    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO topic_stories (
            stories_id,
            topics_id
        )
            SELECT
                stories_id,
                %(topics_id)s AS topics_id
            FROM stories
    """, {
            'topics_id': int(topic['topics_id']),
        })

    assert db.query("SELECT COUNT(*) FROM topic_stories").flat(
    )[0] == num_stories + num_rss_stories

    merge_foreign_rss_stories(db, topic)

    assert db.query(
        "SELECT COUNT(*) FROM topic_stories").flat()[0] == num_stories
    assert db.query(
        "SELECT COUNT(*) FROM topic_seed_urls").flat()[0] == num_rss_stories

    got_topic_stories_ids = db.query(
        "SELECT stories_id FROM topic_stories").flat()
    expected_topic_stories_ids = [s['stories_id'] for s in stories]
    assert sorted(got_topic_stories_ids) == sorted(expected_topic_stories_ids)

    got_seed_urls = db.query(
        """
        SELECT
            topics_id,
            url,
            content
        FROM topic_seed_urls
        WHERE topics_id = %(topics_id)s
    """, {
            'topics_id': topic['topics_id'],
        }).hashes()
    expected_seed_urls = \
        [{'url': s['url'], 'topics_id': topic['topics_id'], 'content': s['title']} for s in rss_stories]

    assert sorted(got_seed_urls,
                  key=itemgetter('url')) == sorted(expected_seed_urls,
                                                   key=itemgetter('url'))
Example #14
def add_content_to_test_story(db: DatabaseHandler, story: dict,
                              feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    download = db.create(table='downloads',
                         insert_hash={
                             'feeds_id': feed['feeds_id'],
                             'url': story['url'],
                             'host': host,
                             'type': 'content',
                             'sequence': 1,
                             'state': 'fetching',
                             'priority': 1,
                             'extracted': True,
                             'stories_id': story['stories_id'],
                         })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, {
            'downloads_id': download['downloads_id'],
            'download_text': extracted_content,
        })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    sentences = lang.split_text_to_sentences(extracted_content)
    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences',
                  insert_hash={
                      'sentence': sentence,
                      'language': language_code_for_text(sentence) or 'en',
                      'sentence_number': sentence_number,
                      'stories_id': story['stories_id'],
                      'media_id': story['media_id'],
                      'publish_date': story['publish_date'],
                  })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    story['download_text'] = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
            'downloads_id': download['downloads_id']
        }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
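
In tests this helper is usually applied to every story of a stack built with create_test_story_stack(); a sketch, assuming the stack layout shown in Example #8 (media keyed by label, each medium with 'feeds', each feed with 'stories'):

    def add_content_to_stack(db: DatabaseHandler, media: dict) -> None:
        # Attach a download and content to every story in the test stack.
        for medium in media.values():
            for feed in medium['feeds'].values():
                for story in feed['stories'].values():
                    add_content_to_test_story(db=db, story=story, feed=feed)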
Example #15
File: stories.py  Project: rleir/mediacloud
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict,
                             new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db,
                         story=story,
                         topic=topic,
                         valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([
                (f, old_download[f])
                for f in ['state', 'error_message', 'download_time']
            ])
            db.update_by_id('downloads', download['downloads_id'],
                            download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """, {'a': download['downloads_id']})

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """, {'b': old_story['stories_id']})

    return story
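
A sketch of a typical call, assuming the target medium was looked up elsewhere (for instance with guess_medium(), the helper used in Example #16); the original story is left untouched and the copy is returned:

    def rehome_topic_story(db: DatabaseHandler, topic: dict, old_story: dict) -> dict:
        # Hypothetical example: copy a topic story onto a medium guessed from its URL.
        new_medium = guess_medium(db, old_story['url'])
        new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)
        assert new_story['media_id'] == new_medium['media_id']
        return new_story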
Example #16
File: stories.py  Project: rleir/mediacloud
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
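
A sketch of calling generate_story() for a spidered page, assuming a db handle and already-fetched HTML; fallback_date is only used when the date guesser finds nothing:

    def add_spidered_story(db: DatabaseHandler, url: str, html: str) -> dict:
        # Insert the story (or return the existing one on a media_id/url conflict),
        # guessing medium, feed, title and publish date from the URL and content.
        return generate_story(
            db=db,
            url=url,
            content=html,
            fallback_date='2020-01-01',  # assumed example value
        )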
Example #17
    def store_download(self, db: DatabaseHandler, download: dict,
                       content: str) -> List[int]:
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        downloads_id = download['downloads_id']

        log.info(f"Processing feed download {downloads_id}...")

        try:
            added_story_ids = self.add_stories_from_feed(db=db,
                                                         download=download,
                                                         content=content)
            story_ids_to_extract = self.return_stories_to_be_extracted_from_feed(
                db=db, download=download, content=content)

        except Exception as ex:

            error_message = f"Error processing feed for download {downloads_id}: {ex}"
            log.error(error_message)

            db.query(
                """
                UPDATE downloads
                SET state = 'feed_error',
                    error_message = %(error_message)s
                WHERE downloads_id = %(downloads_id)s
            """, {
                    'error_message': error_message,
                    'downloads_id': downloads_id,
                })

            # On non-soft errors (explicitly hard errors or unknown errors), pass the exception up
            if not isinstance(ex, McCrawlerFetcherSoftError):
                raise ex

            story_ids_to_extract = []

        else:

            if len(added_story_ids):
                last_new_story_time_sql = 'last_new_story_time = last_attempted_download_time, '
            else:
                last_new_story_time_sql = ''

            db.query(
                f"""
                UPDATE feeds
                SET {last_new_story_time_sql}
                    last_successful_download_time = GREATEST(last_successful_download_time, %(download_time)s)
                WHERE feeds_id = %(feeds_id)s
            """, {
                    'download_time': download['download_time'],
                    'feeds_id': download['feeds_id'],
                })

            # If no new stories, just store "(redundant feed)" to save storage space
            if len(added_story_ids) == 0:
                content = '(redundant feed)'

        # Reread the possibly updated download
        download = db.find_by_id(table='downloads', object_id=downloads_id)

        # Store the feed in any case
        store_content(db=db, download=download, content=content)

        log.info(f"Done processing feed download {downloads_id}")

        return story_ids_to_extract
Example #18
def test_merge_foreign_rss_stories():
    """Test merge_foreign_rss_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')

    medium = create_test_medium(db, 'norss')
    feed = create_test_feed(db=db, label='norss', medium=medium)
    num_stories = 10
    stories = [
        create_test_story(db=db, label=str(i), feed=feed)
        for i in range(num_stories)
    ]

    rss_medium = create_test_medium(db, 'rss')
    rss_medium = db.query(
        "update media set foreign_rss_links = 't' where media_id = %(a)s returning *",
        {'a': rss_medium['media_id']}).hash()
    rss_feed = create_test_feed(db=db, label='rss', medium=rss_medium)
    num_rss_stories = 10
    rss_stories = []
    for i in range(num_rss_stories):
        story = create_test_story(db=db, label=str(i), feed=rss_feed)
        download = db.create('downloads', {
            'stories_id': story['stories_id'],
            'feeds_id': rss_feed['feeds_id'],
            'url': story['url'],
            'host': 'foo',
            'type': 'content',
            'state': 'success',
            'priority': 0,
            'sequence': 0,
            'path': 'postgresql'})
        store_content(db, download, story['title'])
        rss_stories.append(story)

    # noinspection SqlInsertValues
    db.query(
        f"""
            insert into topic_stories (stories_id, topics_id)
                select s.stories_id, {int(topic['topics_id'])}
                from stories s
        """
    )

    assert db.query("select count(*) from topic_stories").flat()[0] == num_stories + num_rss_stories

    merge_foreign_rss_stories(db, topic)

    assert db.query("select count(*) from topic_stories").flat()[0] == num_stories
    assert db.query("select count(*) from topic_seed_urls").flat()[0] == num_rss_stories

    got_topic_stories_ids = db.query("select stories_id from topic_stories").flat()
    expected_topic_stories_ids = [s['stories_id'] for s in stories]
    assert sorted(got_topic_stories_ids) == sorted(expected_topic_stories_ids)

    got_seed_urls = db.query(
        "select topics_id, url, content from topic_seed_urls where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()
    expected_seed_urls = \
        [{'url': s['url'], 'topics_id': topic['topics_id'], 'content': s['title']} for s in rss_stories]

    assert sorted(got_seed_urls, key=itemgetter('url')) == sorted(expected_seed_urls, key=itemgetter('url'))
Example #19
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        """
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
            ORDER BY downloads_id
            LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        db.query(
            """
                INSERT INTO download_texts (
                    downloads_id,
                    download_text,
                    download_text_length
                )
                    SELECT
                        %(downloads_id)s,
                        dt.download_text,
                        dt.download_text_length
                    FROM download_texts AS dt
                    WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # noinspection SqlInsertValues
    db.query(
        """
            INSERT INTO story_sentences (
                stories_id,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            )
                SELECT
                    %(new_stories_id)s,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date,
                    language
                FROM story_sentences
                WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story